Roll generated BoringSSL files to commit 2db0eb3f96a5756298dcd7f9319e56a98585bd10.
Change-Id: I7d0428b1a4aee7c7faf5bcbe928c0ee45a957ea6
Reviewed-on: https://dart-review.googlesource.com/c/boringssl_gen/+/375140
Reviewed-by: Brian Quinlan <bquinlan@google.com>
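
Note: the lists rolled below (crypto_sources, crypto_sources_asm, crypto_sources_nasm, crypto_headers, etc.) are emitted by BoringSSL's generate_build_files.py and are meant to be imported by a consumer BUILD.gn. The following is a minimal sketch of such a consumer; the import path, target name, include_dirs value, and platform condition are illustrative assumptions, not part of this roll.

  # Hypothetical consumer BUILD.gn (names and conditions are assumptions).
  import("BUILD.generated.gni")

  source_set("boringssl_crypto") {
    sources = crypto_sources
    public = crypto_headers

    # Pick the assembly flavour: NASM .asm sources on Windows x86/x86_64,
    # GAS-style .S sources (now generated under src/gen/) everywhere else.
    if (current_os == "win" &&
        (current_cpu == "x86" || current_cpu == "x64")) {
      sources += crypto_sources_nasm
    } else {
      sources += crypto_sources_asm
    }

    include_dirs = [ "src/include" ]
  }
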
diff --git a/BUILD.generated.gni b/BUILD.generated.gni
index 910ebff..599b590 100644
--- a/BUILD.generated.gni
+++ b/BUILD.generated.gni
@@ -15,7 +15,6 @@
# This file is created by generate_build_files.py. Do not edit manually.
crypto_sources = [
- "err_data.c",
"src/crypto/asn1/a_bitstr.c",
"src/crypto/asn1/a_bool.c",
"src/crypto/asn1/a_d2i_fp.c",
@@ -121,6 +120,8 @@
"src/crypto/evp/evp_asn1.c",
"src/crypto/evp/evp_ctx.c",
"src/crypto/evp/internal.h",
+ "src/crypto/evp/p_dh.c",
+ "src/crypto/evp/p_dh_asn1.c",
"src/crypto/evp/p_dsa_asn1.c",
"src/crypto/evp/p_ec.c",
"src/crypto/evp/p_ec_asn1.c",
@@ -165,8 +166,9 @@
"src/crypto/hrss/hrss.c",
"src/crypto/hrss/internal.h",
"src/crypto/internal.h",
+ "src/crypto/keccak/internal.h",
+ "src/crypto/keccak/keccak.c",
"src/crypto/kyber/internal.h",
- "src/crypto/kyber/keccak.c",
"src/crypto/kyber/kyber.c",
"src/crypto/lhash/internal.h",
"src/crypto/lhash/lhash.c",
@@ -210,6 +212,20 @@
"src/crypto/rsa_extra/rsa_crypt.c",
"src/crypto/rsa_extra/rsa_print.c",
"src/crypto/siphash/siphash.c",
+ "src/crypto/spx/address.c",
+ "src/crypto/spx/address.h",
+ "src/crypto/spx/fors.c",
+ "src/crypto/spx/fors.h",
+ "src/crypto/spx/merkle.c",
+ "src/crypto/spx/merkle.h",
+ "src/crypto/spx/params.h",
+ "src/crypto/spx/spx.c",
+ "src/crypto/spx/spx_util.c",
+ "src/crypto/spx/spx_util.h",
+ "src/crypto/spx/thash.c",
+ "src/crypto/spx/thash.h",
+ "src/crypto/spx/wots.c",
+ "src/crypto/spx/wots.h",
"src/crypto/stack/stack.c",
"src/crypto/thread.c",
"src/crypto/thread_none.c",
@@ -226,6 +242,7 @@
"src/crypto/x509/asn1_gen.c",
"src/crypto/x509/by_dir.c",
"src/crypto/x509/by_file.c",
+ "src/crypto/x509/ext_dat.h",
"src/crypto/x509/i2d_pr.c",
"src/crypto/x509/internal.h",
"src/crypto/x509/name_print.c",
@@ -235,6 +252,29 @@
"src/crypto/x509/t_req.c",
"src/crypto/x509/t_x509.c",
"src/crypto/x509/t_x509a.c",
+ "src/crypto/x509/v3_akey.c",
+ "src/crypto/x509/v3_akeya.c",
+ "src/crypto/x509/v3_alt.c",
+ "src/crypto/x509/v3_bcons.c",
+ "src/crypto/x509/v3_bitst.c",
+ "src/crypto/x509/v3_conf.c",
+ "src/crypto/x509/v3_cpols.c",
+ "src/crypto/x509/v3_crld.c",
+ "src/crypto/x509/v3_enum.c",
+ "src/crypto/x509/v3_extku.c",
+ "src/crypto/x509/v3_genn.c",
+ "src/crypto/x509/v3_ia5.c",
+ "src/crypto/x509/v3_info.c",
+ "src/crypto/x509/v3_int.c",
+ "src/crypto/x509/v3_lib.c",
+ "src/crypto/x509/v3_ncons.c",
+ "src/crypto/x509/v3_ocsp.c",
+ "src/crypto/x509/v3_pcons.c",
+ "src/crypto/x509/v3_pmaps.c",
+ "src/crypto/x509/v3_prn.c",
+ "src/crypto/x509/v3_purp.c",
+ "src/crypto/x509/v3_skey.c",
+ "src/crypto/x509/v3_utl.c",
"src/crypto/x509/x509.c",
"src/crypto/x509/x509_att.c",
"src/crypto/x509/x509_cmp.c",
@@ -259,9 +299,7 @@
"src/crypto/x509/x_attrib.c",
"src/crypto/x509/x_crl.c",
"src/crypto/x509/x_exten.c",
- "src/crypto/x509/x_info.c",
"src/crypto/x509/x_name.c",
- "src/crypto/x509/x_pkey.c",
"src/crypto/x509/x_pubkey.c",
"src/crypto/x509/x_req.c",
"src/crypto/x509/x_sig.c",
@@ -269,31 +307,7 @@
"src/crypto/x509/x_val.c",
"src/crypto/x509/x_x509.c",
"src/crypto/x509/x_x509a.c",
- "src/crypto/x509v3/ext_dat.h",
- "src/crypto/x509v3/internal.h",
- "src/crypto/x509v3/v3_akey.c",
- "src/crypto/x509v3/v3_akeya.c",
- "src/crypto/x509v3/v3_alt.c",
- "src/crypto/x509v3/v3_bcons.c",
- "src/crypto/x509v3/v3_bitst.c",
- "src/crypto/x509v3/v3_conf.c",
- "src/crypto/x509v3/v3_cpols.c",
- "src/crypto/x509v3/v3_crld.c",
- "src/crypto/x509v3/v3_enum.c",
- "src/crypto/x509v3/v3_extku.c",
- "src/crypto/x509v3/v3_genn.c",
- "src/crypto/x509v3/v3_ia5.c",
- "src/crypto/x509v3/v3_info.c",
- "src/crypto/x509v3/v3_int.c",
- "src/crypto/x509v3/v3_lib.c",
- "src/crypto/x509v3/v3_ncons.c",
- "src/crypto/x509v3/v3_ocsp.c",
- "src/crypto/x509v3/v3_pcons.c",
- "src/crypto/x509v3/v3_pmaps.c",
- "src/crypto/x509v3/v3_prn.c",
- "src/crypto/x509v3/v3_purp.c",
- "src/crypto/x509v3/v3_skey.c",
- "src/crypto/x509v3/v3_utl.c",
+ "src/gen/crypto/err_data.c",
"src/third_party/fiat/curve25519_32.h",
"src/third_party/fiat/curve25519_64.h",
"src/third_party/fiat/curve25519_64_adx.h",
@@ -304,177 +318,168 @@
]
crypto_sources_asm = [
- "apple-aarch64/crypto/chacha/chacha-armv8-apple.S",
- "apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S",
- "apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S",
- "apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S",
- "apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S",
- "apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S",
- "apple-aarch64/crypto/test/trampoline-armv8-apple.S",
- "apple-arm/crypto/chacha/chacha-armv4-apple.S",
- "apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S",
- "apple-arm/crypto/fipsmodule/armv4-mont-apple.S",
- "apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S",
- "apple-arm/crypto/fipsmodule/ghash-armv4-apple.S",
- "apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S",
- "apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S",
- "apple-arm/crypto/fipsmodule/sha256-armv4-apple.S",
- "apple-arm/crypto/fipsmodule/sha512-armv4-apple.S",
- "apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S",
- "apple-arm/crypto/test/trampoline-armv4-apple.S",
- "apple-x86/crypto/chacha/chacha-x86-apple.S",
- "apple-x86/crypto/fipsmodule/aesni-x86-apple.S",
- "apple-x86/crypto/fipsmodule/bn-586-apple.S",
- "apple-x86/crypto/fipsmodule/co-586-apple.S",
- "apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S",
- "apple-x86/crypto/fipsmodule/ghash-x86-apple.S",
- "apple-x86/crypto/fipsmodule/md5-586-apple.S",
- "apple-x86/crypto/fipsmodule/sha1-586-apple.S",
- "apple-x86/crypto/fipsmodule/sha256-586-apple.S",
- "apple-x86/crypto/fipsmodule/sha512-586-apple.S",
- "apple-x86/crypto/fipsmodule/vpaes-x86-apple.S",
- "apple-x86/crypto/fipsmodule/x86-mont-apple.S",
- "apple-x86/crypto/test/trampoline-x86-apple.S",
- "apple-x86_64/crypto/chacha/chacha-x86_64-apple.S",
- "apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S",
- "apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S",
- "apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S",
- "apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S",
- "apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S",
- "apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S",
- "apple-x86_64/crypto/fipsmodule/x86_64-mont5-apple.S",
- "apple-x86_64/crypto/test/trampoline-x86_64-apple.S",
- "linux-aarch64/crypto/chacha/chacha-armv8-linux.S",
- "linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/aesv8-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/armv8-mont-linux.S",
- "linux-aarch64/crypto/fipsmodule/bn-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/ghashv8-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/p256-armv8-asm-linux.S",
- "linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-linux.S",
- "linux-aarch64/crypto/fipsmodule/sha1-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/sha256-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/sha512-armv8-linux.S",
- "linux-aarch64/crypto/fipsmodule/vpaes-armv8-linux.S",
- "linux-aarch64/crypto/test/trampoline-armv8-linux.S",
- "linux-arm/crypto/chacha/chacha-armv4-linux.S",
- "linux-arm/crypto/fipsmodule/aesv8-armv7-linux.S",
- "linux-arm/crypto/fipsmodule/armv4-mont-linux.S",
- "linux-arm/crypto/fipsmodule/bsaes-armv7-linux.S",
- "linux-arm/crypto/fipsmodule/ghash-armv4-linux.S",
- "linux-arm/crypto/fipsmodule/ghashv8-armv7-linux.S",
- "linux-arm/crypto/fipsmodule/sha1-armv4-large-linux.S",
- "linux-arm/crypto/fipsmodule/sha256-armv4-linux.S",
- "linux-arm/crypto/fipsmodule/sha512-armv4-linux.S",
- "linux-arm/crypto/fipsmodule/vpaes-armv7-linux.S",
- "linux-arm/crypto/test/trampoline-armv4-linux.S",
- "linux-x86/crypto/chacha/chacha-x86-linux.S",
- "linux-x86/crypto/fipsmodule/aesni-x86-linux.S",
- "linux-x86/crypto/fipsmodule/bn-586-linux.S",
- "linux-x86/crypto/fipsmodule/co-586-linux.S",
- "linux-x86/crypto/fipsmodule/ghash-ssse3-x86-linux.S",
- "linux-x86/crypto/fipsmodule/ghash-x86-linux.S",
- "linux-x86/crypto/fipsmodule/md5-586-linux.S",
- "linux-x86/crypto/fipsmodule/sha1-586-linux.S",
- "linux-x86/crypto/fipsmodule/sha256-586-linux.S",
- "linux-x86/crypto/fipsmodule/sha512-586-linux.S",
- "linux-x86/crypto/fipsmodule/vpaes-x86-linux.S",
- "linux-x86/crypto/fipsmodule/x86-mont-linux.S",
- "linux-x86/crypto/test/trampoline-x86-linux.S",
- "linux-x86_64/crypto/chacha/chacha-x86_64-linux.S",
- "linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.S",
- "linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/aesni-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/ghash-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/md5-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm-linux.S",
- "linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.S",
- "linux-x86_64/crypto/fipsmodule/rdrand-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/rsaz-avx2-linux.S",
- "linux-x86_64/crypto/fipsmodule/sha1-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/sha256-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/sha512-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/vpaes-x86_64-linux.S",
- "linux-x86_64/crypto/fipsmodule/x86_64-mont-linux.S",
- "linux-x86_64/crypto/fipsmodule/x86_64-mont5-linux.S",
- "linux-x86_64/crypto/test/trampoline-x86_64-linux.S",
"src/crypto/curve25519/asm/x25519-asm-arm.S",
"src/crypto/hrss/asm/poly_rq_mul.S",
"src/crypto/poly1305/poly1305_arm_asm.S",
+ "src/gen/bcm/aesni-gcm-x86_64-apple.S",
+ "src/gen/bcm/aesni-gcm-x86_64-linux.S",
+ "src/gen/bcm/aesni-x86-apple.S",
+ "src/gen/bcm/aesni-x86-linux.S",
+ "src/gen/bcm/aesni-x86_64-apple.S",
+ "src/gen/bcm/aesni-x86_64-linux.S",
+ "src/gen/bcm/aesv8-armv7-linux.S",
+ "src/gen/bcm/aesv8-armv8-apple.S",
+ "src/gen/bcm/aesv8-armv8-linux.S",
+ "src/gen/bcm/aesv8-armv8-win.S",
+ "src/gen/bcm/aesv8-gcm-armv8-apple.S",
+ "src/gen/bcm/aesv8-gcm-armv8-linux.S",
+ "src/gen/bcm/aesv8-gcm-armv8-win.S",
+ "src/gen/bcm/armv4-mont-linux.S",
+ "src/gen/bcm/armv8-mont-apple.S",
+ "src/gen/bcm/armv8-mont-linux.S",
+ "src/gen/bcm/armv8-mont-win.S",
+ "src/gen/bcm/bn-586-apple.S",
+ "src/gen/bcm/bn-586-linux.S",
+ "src/gen/bcm/bn-armv8-apple.S",
+ "src/gen/bcm/bn-armv8-linux.S",
+ "src/gen/bcm/bn-armv8-win.S",
+ "src/gen/bcm/bsaes-armv7-linux.S",
+ "src/gen/bcm/co-586-apple.S",
+ "src/gen/bcm/co-586-linux.S",
+ "src/gen/bcm/ghash-armv4-linux.S",
+ "src/gen/bcm/ghash-neon-armv8-apple.S",
+ "src/gen/bcm/ghash-neon-armv8-linux.S",
+ "src/gen/bcm/ghash-neon-armv8-win.S",
+ "src/gen/bcm/ghash-ssse3-x86-apple.S",
+ "src/gen/bcm/ghash-ssse3-x86-linux.S",
+ "src/gen/bcm/ghash-ssse3-x86_64-apple.S",
+ "src/gen/bcm/ghash-ssse3-x86_64-linux.S",
+ "src/gen/bcm/ghash-x86-apple.S",
+ "src/gen/bcm/ghash-x86-linux.S",
+ "src/gen/bcm/ghash-x86_64-apple.S",
+ "src/gen/bcm/ghash-x86_64-linux.S",
+ "src/gen/bcm/ghashv8-armv7-linux.S",
+ "src/gen/bcm/ghashv8-armv8-apple.S",
+ "src/gen/bcm/ghashv8-armv8-linux.S",
+ "src/gen/bcm/ghashv8-armv8-win.S",
+ "src/gen/bcm/md5-586-apple.S",
+ "src/gen/bcm/md5-586-linux.S",
+ "src/gen/bcm/md5-x86_64-apple.S",
+ "src/gen/bcm/md5-x86_64-linux.S",
+ "src/gen/bcm/p256-armv8-asm-apple.S",
+ "src/gen/bcm/p256-armv8-asm-linux.S",
+ "src/gen/bcm/p256-armv8-asm-win.S",
+ "src/gen/bcm/p256-x86_64-asm-apple.S",
+ "src/gen/bcm/p256-x86_64-asm-linux.S",
+ "src/gen/bcm/p256_beeu-armv8-asm-apple.S",
+ "src/gen/bcm/p256_beeu-armv8-asm-linux.S",
+ "src/gen/bcm/p256_beeu-armv8-asm-win.S",
+ "src/gen/bcm/p256_beeu-x86_64-asm-apple.S",
+ "src/gen/bcm/p256_beeu-x86_64-asm-linux.S",
+ "src/gen/bcm/rdrand-x86_64-apple.S",
+ "src/gen/bcm/rdrand-x86_64-linux.S",
+ "src/gen/bcm/rsaz-avx2-apple.S",
+ "src/gen/bcm/rsaz-avx2-linux.S",
+ "src/gen/bcm/sha1-586-apple.S",
+ "src/gen/bcm/sha1-586-linux.S",
+ "src/gen/bcm/sha1-armv4-large-linux.S",
+ "src/gen/bcm/sha1-armv8-apple.S",
+ "src/gen/bcm/sha1-armv8-linux.S",
+ "src/gen/bcm/sha1-armv8-win.S",
+ "src/gen/bcm/sha1-x86_64-apple.S",
+ "src/gen/bcm/sha1-x86_64-linux.S",
+ "src/gen/bcm/sha256-586-apple.S",
+ "src/gen/bcm/sha256-586-linux.S",
+ "src/gen/bcm/sha256-armv4-linux.S",
+ "src/gen/bcm/sha256-armv8-apple.S",
+ "src/gen/bcm/sha256-armv8-linux.S",
+ "src/gen/bcm/sha256-armv8-win.S",
+ "src/gen/bcm/sha256-x86_64-apple.S",
+ "src/gen/bcm/sha256-x86_64-linux.S",
+ "src/gen/bcm/sha512-586-apple.S",
+ "src/gen/bcm/sha512-586-linux.S",
+ "src/gen/bcm/sha512-armv4-linux.S",
+ "src/gen/bcm/sha512-armv8-apple.S",
+ "src/gen/bcm/sha512-armv8-linux.S",
+ "src/gen/bcm/sha512-armv8-win.S",
+ "src/gen/bcm/sha512-x86_64-apple.S",
+ "src/gen/bcm/sha512-x86_64-linux.S",
+ "src/gen/bcm/vpaes-armv7-linux.S",
+ "src/gen/bcm/vpaes-armv8-apple.S",
+ "src/gen/bcm/vpaes-armv8-linux.S",
+ "src/gen/bcm/vpaes-armv8-win.S",
+ "src/gen/bcm/vpaes-x86-apple.S",
+ "src/gen/bcm/vpaes-x86-linux.S",
+ "src/gen/bcm/vpaes-x86_64-apple.S",
+ "src/gen/bcm/vpaes-x86_64-linux.S",
+ "src/gen/bcm/x86-mont-apple.S",
+ "src/gen/bcm/x86-mont-linux.S",
+ "src/gen/bcm/x86_64-mont-apple.S",
+ "src/gen/bcm/x86_64-mont-linux.S",
+ "src/gen/bcm/x86_64-mont5-apple.S",
+ "src/gen/bcm/x86_64-mont5-linux.S",
+ "src/gen/crypto/aes128gcmsiv-x86_64-apple.S",
+ "src/gen/crypto/aes128gcmsiv-x86_64-linux.S",
+ "src/gen/crypto/chacha-armv4-linux.S",
+ "src/gen/crypto/chacha-armv8-apple.S",
+ "src/gen/crypto/chacha-armv8-linux.S",
+ "src/gen/crypto/chacha-armv8-win.S",
+ "src/gen/crypto/chacha-x86-apple.S",
+ "src/gen/crypto/chacha-x86-linux.S",
+ "src/gen/crypto/chacha-x86_64-apple.S",
+ "src/gen/crypto/chacha-x86_64-linux.S",
+ "src/gen/crypto/chacha20_poly1305_armv8-apple.S",
+ "src/gen/crypto/chacha20_poly1305_armv8-linux.S",
+ "src/gen/crypto/chacha20_poly1305_armv8-win.S",
+ "src/gen/crypto/chacha20_poly1305_x86_64-apple.S",
+ "src/gen/crypto/chacha20_poly1305_x86_64-linux.S",
+ "src/gen/test_support/trampoline-armv4-linux.S",
+ "src/gen/test_support/trampoline-armv8-apple.S",
+ "src/gen/test_support/trampoline-armv8-linux.S",
+ "src/gen/test_support/trampoline-armv8-win.S",
+ "src/gen/test_support/trampoline-x86-apple.S",
+ "src/gen/test_support/trampoline-x86-linux.S",
+ "src/gen/test_support/trampoline-x86_64-apple.S",
+ "src/gen/test_support/trampoline-x86_64-linux.S",
"src/third_party/fiat/asm/fiat_curve25519_adx_mul.S",
"src/third_party/fiat/asm/fiat_curve25519_adx_square.S",
- "win-aarch64/crypto/chacha/chacha-armv8-win.S",
- "win-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-win.S",
- "win-aarch64/crypto/fipsmodule/aesv8-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/armv8-mont-win.S",
- "win-aarch64/crypto/fipsmodule/bn-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/ghash-neon-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/ghashv8-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/p256-armv8-asm-win.S",
- "win-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-win.S",
- "win-aarch64/crypto/fipsmodule/sha1-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/sha256-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/sha512-armv8-win.S",
- "win-aarch64/crypto/fipsmodule/vpaes-armv8-win.S",
- "win-aarch64/crypto/test/trampoline-armv8-win.S",
+ "src/third_party/fiat/asm/fiat_p256_adx_mul.S",
+ "src/third_party/fiat/asm/fiat_p256_adx_sqr.S",
]
crypto_sources_nasm = [
- "win-x86/crypto/chacha/chacha-x86-win.asm",
- "win-x86/crypto/fipsmodule/aesni-x86-win.asm",
- "win-x86/crypto/fipsmodule/bn-586-win.asm",
- "win-x86/crypto/fipsmodule/co-586-win.asm",
- "win-x86/crypto/fipsmodule/ghash-ssse3-x86-win.asm",
- "win-x86/crypto/fipsmodule/ghash-x86-win.asm",
- "win-x86/crypto/fipsmodule/md5-586-win.asm",
- "win-x86/crypto/fipsmodule/sha1-586-win.asm",
- "win-x86/crypto/fipsmodule/sha256-586-win.asm",
- "win-x86/crypto/fipsmodule/sha512-586-win.asm",
- "win-x86/crypto/fipsmodule/vpaes-x86-win.asm",
- "win-x86/crypto/fipsmodule/x86-mont-win.asm",
- "win-x86/crypto/test/trampoline-x86-win.asm",
- "win-x86_64/crypto/chacha/chacha-x86_64-win.asm",
- "win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-win.asm",
- "win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/aesni-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/ghash-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/md5-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/p256-x86_64-asm-win.asm",
- "win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-win.asm",
- "win-x86_64/crypto/fipsmodule/rdrand-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/rsaz-avx2-win.asm",
- "win-x86_64/crypto/fipsmodule/sha1-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/sha256-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/sha512-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/vpaes-x86_64-win.asm",
- "win-x86_64/crypto/fipsmodule/x86_64-mont-win.asm",
- "win-x86_64/crypto/fipsmodule/x86_64-mont5-win.asm",
- "win-x86_64/crypto/test/trampoline-x86_64-win.asm",
+ "src/gen/bcm/aesni-gcm-x86_64-win.asm",
+ "src/gen/bcm/aesni-x86-win.asm",
+ "src/gen/bcm/aesni-x86_64-win.asm",
+ "src/gen/bcm/bn-586-win.asm",
+ "src/gen/bcm/co-586-win.asm",
+ "src/gen/bcm/ghash-ssse3-x86-win.asm",
+ "src/gen/bcm/ghash-ssse3-x86_64-win.asm",
+ "src/gen/bcm/ghash-x86-win.asm",
+ "src/gen/bcm/ghash-x86_64-win.asm",
+ "src/gen/bcm/md5-586-win.asm",
+ "src/gen/bcm/md5-x86_64-win.asm",
+ "src/gen/bcm/p256-x86_64-asm-win.asm",
+ "src/gen/bcm/p256_beeu-x86_64-asm-win.asm",
+ "src/gen/bcm/rdrand-x86_64-win.asm",
+ "src/gen/bcm/rsaz-avx2-win.asm",
+ "src/gen/bcm/sha1-586-win.asm",
+ "src/gen/bcm/sha1-x86_64-win.asm",
+ "src/gen/bcm/sha256-586-win.asm",
+ "src/gen/bcm/sha256-x86_64-win.asm",
+ "src/gen/bcm/sha512-586-win.asm",
+ "src/gen/bcm/sha512-x86_64-win.asm",
+ "src/gen/bcm/vpaes-x86-win.asm",
+ "src/gen/bcm/vpaes-x86_64-win.asm",
+ "src/gen/bcm/x86-mont-win.asm",
+ "src/gen/bcm/x86_64-mont-win.asm",
+ "src/gen/bcm/x86_64-mont5-win.asm",
+ "src/gen/crypto/aes128gcmsiv-x86_64-win.asm",
+ "src/gen/crypto/chacha-x86-win.asm",
+ "src/gen/crypto/chacha-x86_64-win.asm",
+ "src/gen/crypto/chacha20_poly1305_x86_64-win.asm",
+ "src/gen/test_support/trampoline-x86-win.asm",
+ "src/gen/test_support/trampoline-x86_64-win.asm",
]
crypto_headers = [
@@ -517,13 +522,14 @@
"src/include/openssl/evp.h",
"src/include/openssl/evp_errors.h",
"src/include/openssl/ex_data.h",
+ "src/include/openssl/experimental/kyber.h",
+ "src/include/openssl/experimental/spx.h",
"src/include/openssl/hkdf.h",
"src/include/openssl/hmac.h",
"src/include/openssl/hpke.h",
"src/include/openssl/hrss.h",
"src/include/openssl/is_boringssl.h",
"src/include/openssl/kdf.h",
- "src/include/openssl/kyber.h",
"src/include/openssl/lhash.h",
"src/include/openssl/md4.h",
"src/include/openssl/md5.h",
@@ -541,6 +547,7 @@
"src/include/openssl/pkcs8.h",
"src/include/openssl/poly1305.h",
"src/include/openssl/pool.h",
+ "src/include/openssl/posix_time.h",
"src/include/openssl/rand.h",
"src/include/openssl/rc4.h",
"src/include/openssl/ripemd.h",
@@ -559,6 +566,33 @@
"src/include/openssl/x509.h",
"src/include/openssl/x509_vfy.h",
"src/include/openssl/x509v3.h",
+ "src/include/openssl/x509v3_errors.h",
+]
+
+rust_bssl_sys = [ "src/rust/bssl-sys/src/lib.rs" ]
+
+rust_bssl_crypto = [
+ "src/rust/bssl-crypto/src/aead.rs",
+ "src/rust/bssl-crypto/src/aes.rs",
+ "src/rust/bssl-crypto/src/cipher/aes_cbc.rs",
+ "src/rust/bssl-crypto/src/cipher/aes_ctr.rs",
+ "src/rust/bssl-crypto/src/cipher/mod.rs",
+ "src/rust/bssl-crypto/src/digest.rs",
+ "src/rust/bssl-crypto/src/ec.rs",
+ "src/rust/bssl-crypto/src/ecdh.rs",
+ "src/rust/bssl-crypto/src/ecdsa.rs",
+ "src/rust/bssl-crypto/src/ed25519.rs",
+ "src/rust/bssl-crypto/src/hkdf.rs",
+ "src/rust/bssl-crypto/src/hmac.rs",
+ "src/rust/bssl-crypto/src/hpke.rs",
+ "src/rust/bssl-crypto/src/lib.rs",
+ "src/rust/bssl-crypto/src/macros.rs",
+ "src/rust/bssl-crypto/src/mem.rs",
+ "src/rust/bssl-crypto/src/rand.rs",
+ "src/rust/bssl-crypto/src/rsa.rs",
+ "src/rust/bssl-crypto/src/scoped.rs",
+ "src/rust/bssl-crypto/src/test_helpers.rs",
+ "src/rust/bssl-crypto/src/x25519.rs",
]
ssl_sources = [
@@ -584,6 +618,7 @@
"src/ssl/ssl_buffer.cc",
"src/ssl/ssl_cert.cc",
"src/ssl/ssl_cipher.cc",
+ "src/ssl/ssl_credential.cc",
"src/ssl/ssl_file.cc",
"src/ssl/ssl_key_share.cc",
"src/ssl/ssl_lib.cc",
@@ -611,97 +646,91 @@
]
pki_sources = [
- "src/pki/asn1_util.h",
"src/pki/cert_error_id.cc",
- "src/pki/cert_error_id.h",
"src/pki/cert_error_params.cc",
- "src/pki/cert_error_params.h",
"src/pki/cert_errors.cc",
+ "src/pki/cert_issuer_source_static.cc",
+ "src/pki/certificate.cc",
+ "src/pki/certificate_policies.cc",
+ "src/pki/common_cert_errors.cc",
+ "src/pki/crl.cc",
+ "src/pki/encode_values.cc",
+ "src/pki/extended_key_usage.cc",
+ "src/pki/general_names.cc",
+ "src/pki/input.cc",
+ "src/pki/ip_util.cc",
+ "src/pki/name_constraints.cc",
+ "src/pki/ocsp.cc",
+ "src/pki/ocsp_verify_result.cc",
+ "src/pki/parse_certificate.cc",
+ "src/pki/parse_name.cc",
+ "src/pki/parse_values.cc",
+ "src/pki/parsed_certificate.cc",
+ "src/pki/parser.cc",
+ "src/pki/path_builder.cc",
+ "src/pki/pem.cc",
+ "src/pki/revocation_util.cc",
+ "src/pki/signature_algorithm.cc",
+ "src/pki/simple_path_builder_delegate.cc",
+ "src/pki/string_util.cc",
+ "src/pki/trust_store.cc",
+ "src/pki/trust_store_collection.cc",
+ "src/pki/trust_store_in_memory.cc",
+ "src/pki/verify_certificate_chain.cc",
+ "src/pki/verify_error.cc",
+ "src/pki/verify_name_match.cc",
+ "src/pki/verify_signed_data.cc",
+]
+
+pki_internal_headers = [
+ "src/pki/cert_error_id.h",
+ "src/pki/cert_error_params.h",
"src/pki/cert_errors.h",
"src/pki/cert_issuer_source.h",
- "src/pki/cert_issuer_source_static.cc",
"src/pki/cert_issuer_source_static.h",
"src/pki/cert_issuer_source_sync_unittest.h",
- "src/pki/cert_status_flags.h",
- "src/pki/cert_status_flags_list.h",
- "src/pki/certificate_policies.cc",
"src/pki/certificate_policies.h",
- "src/pki/common_cert_errors.cc",
"src/pki/common_cert_errors.h",
- "src/pki/crl.cc",
"src/pki/crl.h",
- "src/pki/encode_values.cc",
"src/pki/encode_values.h",
- "src/pki/extended_key_usage.cc",
"src/pki/extended_key_usage.h",
- "src/pki/fillins/file_util.h",
- "src/pki/fillins/fillins_base64.cc",
- "src/pki/fillins/fillins_base64.h",
- "src/pki/fillins/fillins_string_util.cc",
- "src/pki/fillins/fillins_string_util.h",
- "src/pki/fillins/log.h",
- "src/pki/fillins/net_errors.h",
- "src/pki/fillins/openssl_util.cc",
- "src/pki/fillins/openssl_util.h",
- "src/pki/fillins/path_service.h",
- "src/pki/general_names.cc",
"src/pki/general_names.h",
- "src/pki/input.cc",
"src/pki/input.h",
- "src/pki/ip_util.cc",
"src/pki/ip_util.h",
"src/pki/mock_signature_verify_cache.h",
- "src/pki/name_constraints.cc",
"src/pki/name_constraints.h",
"src/pki/nist_pkits_unittest.h",
- "src/pki/ocsp.cc",
"src/pki/ocsp.h",
"src/pki/ocsp_revocation_status.h",
- "src/pki/ocsp_verify_result.cc",
"src/pki/ocsp_verify_result.h",
- "src/pki/parse_certificate.cc",
"src/pki/parse_certificate.h",
- "src/pki/parse_name.cc",
"src/pki/parse_name.h",
- "src/pki/parse_values.cc",
"src/pki/parse_values.h",
- "src/pki/parsed_certificate.cc",
"src/pki/parsed_certificate.h",
- "src/pki/parser.cc",
"src/pki/parser.h",
- "src/pki/path_builder.cc",
"src/pki/path_builder.h",
- "src/pki/pem.cc",
"src/pki/pem.h",
- "src/pki/revocation_util.cc",
"src/pki/revocation_util.h",
- "src/pki/signature_algorithm.cc",
"src/pki/signature_algorithm.h",
- "src/pki/signature_verify_cache.h",
- "src/pki/simple_path_builder_delegate.cc",
"src/pki/simple_path_builder_delegate.h",
- "src/pki/string_util.cc",
"src/pki/string_util.h",
- "src/pki/tag.cc",
- "src/pki/tag.h",
"src/pki/test_helpers.h",
"src/pki/testdata/nist-pkits/pkits_testcases-inl.h",
- "src/pki/testdata/test_certificate_data.h",
- "src/pki/trust_store.cc",
"src/pki/trust_store.h",
- "src/pki/trust_store_collection.cc",
"src/pki/trust_store_collection.h",
- "src/pki/trust_store_in_memory.cc",
"src/pki/trust_store_in_memory.h",
- "src/pki/verify_certificate_chain.cc",
"src/pki/verify_certificate_chain.h",
"src/pki/verify_certificate_chain_typed_unittest.h",
- "src/pki/verify_name_match.cc",
"src/pki/verify_name_match.h",
- "src/pki/verify_signed_data.cc",
"src/pki/verify_signed_data.h",
]
+pki_headers = [
+ "src/include/openssl/pki/certificate.h",
+ "src/include/openssl/pki/signature_verify_cache.h",
+ "src/include/openssl/pki/verify_error.h",
+]
+
tool_sources = [
"src/tool/args.cc",
"src/tool/ciphers.cc",
diff --git a/BUILD.generated_tests.gni b/BUILD.generated_tests.gni
index d7b63a7..a306858 100644
--- a/BUILD.generated_tests.gni
+++ b/BUILD.generated_tests.gni
@@ -19,7 +19,12 @@
"src/crypto/test/abi_test.h",
"src/crypto/test/file_test.cc",
"src/crypto/test/file_test.h",
+ "src/crypto/test/file_test_gtest.cc",
+ "src/crypto/test/file_util.cc",
+ "src/crypto/test/file_util.h",
"src/crypto/test/gtest_main.h",
+ "src/crypto/test/test_data.cc",
+ "src/crypto/test/test_data.h",
"src/crypto/test/test_util.cc",
"src/crypto/test/test_util.h",
"src/crypto/test/wycheproof_util.cc",
@@ -36,7 +41,6 @@
]
crypto_test_sources = [
- "crypto_test_data.cc",
"src/crypto/abi_self_test.cc",
"src/crypto/asn1/asn1_test.cc",
"src/crypto/base64/base64_test.cc",
@@ -69,6 +73,7 @@
"src/crypto/fipsmodule/cmac/cmac_test.cc",
"src/crypto/fipsmodule/ec/ec_test.cc",
"src/crypto/fipsmodule/ec/p256-nistz_test.cc",
+ "src/crypto/fipsmodule/ec/p256_test.cc",
"src/crypto/fipsmodule/ecdsa/ecdsa_test.cc",
"src/crypto/fipsmodule/hkdf/hkdf_test.cc",
"src/crypto/fipsmodule/md5/md5_test.cc",
@@ -81,6 +86,7 @@
"src/crypto/hpke/hpke_test.cc",
"src/crypto/hrss/hrss_test.cc",
"src/crypto/impl_dispatch_test.cc",
+ "src/crypto/keccak/keccak_test.cc",
"src/crypto/kyber/kyber_test.cc",
"src/crypto/lhash/lhash_test.cc",
"src/crypto/obj/obj_test.cc",
@@ -96,14 +102,14 @@
"src/crypto/rsa_extra/rsa_test.cc",
"src/crypto/self_test.cc",
"src/crypto/siphash/siphash_test.cc",
+ "src/crypto/spx/spx_test.cc",
"src/crypto/stack/stack_test.cc",
- "src/crypto/test/file_test_gtest.cc",
"src/crypto/test/gtest_main.cc",
"src/crypto/thread_test.cc",
"src/crypto/trust_token/trust_token_test.cc",
+ "src/crypto/x509/tab_test.cc",
"src/crypto/x509/x509_test.cc",
"src/crypto/x509/x509_time_test.cc",
- "src/crypto/x509v3/tab_test.cc",
]
crypto_test_data = [
@@ -167,9 +173,14 @@
"src/crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
"src/crypto/hmac_extra/hmac_tests.txt",
"src/crypto/hpke/hpke_test_vectors.txt",
- "src/crypto/kyber/keccak_tests.txt",
+ "src/crypto/keccak/keccak_tests.txt",
"src/crypto/kyber/kyber_tests.txt",
+ "src/crypto/pkcs8/test/bad1.p12",
+ "src/crypto/pkcs8/test/bad2.p12",
+ "src/crypto/pkcs8/test/bad3.p12",
"src/crypto/pkcs8/test/empty_password.p12",
+ "src/crypto/pkcs8/test/empty_password_ber.p12",
+ "src/crypto/pkcs8/test/empty_password_ber_nested.p12",
"src/crypto/pkcs8/test/no_encryption.p12",
"src/crypto/pkcs8/test/nss.p12",
"src/crypto/pkcs8/test/null_password.p12",
@@ -180,6 +191,8 @@
"src/crypto/pkcs8/test/windows.p12",
"src/crypto/poly1305/poly1305_tests.txt",
"src/crypto/siphash/siphash_tests.txt",
+ "src/crypto/spx/spx_tests.txt",
+ "src/crypto/spx/spx_tests_deterministic.txt",
"src/crypto/x509/test/basic_constraints_ca.pem",
"src/crypto/x509/test/basic_constraints_ca_pathlen_0.pem",
"src/crypto/x509/test/basic_constraints_ca_pathlen_1.pem",
@@ -346,27 +359,14 @@
"src/pki/testdata/cert_issuer_source_static_unittest/d.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/e1.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/e2.pem",
- "src/pki/testdata/cert_issuer_source_static_unittest/generate-certs.py",
"src/pki/testdata/cert_issuer_source_static_unittest/i1_1.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/i1_2.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/i2.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/i3_1.pem",
"src/pki/testdata/cert_issuer_source_static_unittest/i3_2.pem",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/C1.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/C2.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/D.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/E1.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/E2.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/I1.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/I2.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/I3.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/I3_1.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/Root.key",
- "src/pki/testdata/cert_issuer_source_static_unittest/keys/i1_1.key",
"src/pki/testdata/cert_issuer_source_static_unittest/root.pem",
"src/pki/testdata/certificate_policies_unittest/anypolicy.pem",
"src/pki/testdata/certificate_policies_unittest/anypolicy_with_qualifier.pem",
- "src/pki/testdata/certificate_policies_unittest/generate_policies.py",
"src/pki/testdata/certificate_policies_unittest/invalid-anypolicy_with_custom_qualifier.pem",
"src/pki/testdata/certificate_policies_unittest/invalid-empty.pem",
"src/pki/testdata/certificate_policies_unittest/invalid-policy_1_2_3_dupe.pem",
@@ -395,7 +395,6 @@
"src/pki/testdata/crl_unittest/bad_thisupdate_in_future.pem",
"src/pki/testdata/crl_unittest/bad_thisupdate_too_old.pem",
"src/pki/testdata/crl_unittest/bad_wrong_issuer.pem",
- "src/pki/testdata/crl_unittest/generate_crl_test_data.py",
"src/pki/testdata/crl_unittest/good.pem",
"src/pki/testdata/crl_unittest/good_fake_extension.pem",
"src/pki/testdata/crl_unittest/good_fake_extension_no_nextupdate.pem",
@@ -466,7 +465,6 @@
"src/pki/testdata/name_constraints_unittest/dnsname2.pem",
"src/pki/testdata/name_constraints_unittest/edipartyname-excluded.pem",
"src/pki/testdata/name_constraints_unittest/edipartyname-permitted.pem",
- "src/pki/testdata/name_constraints_unittest/generate_name_constraints.py",
"src/pki/testdata/name_constraints_unittest/invalid-empty_excluded_subtree.pem",
"src/pki/testdata/name_constraints_unittest/invalid-empty_permitted_subtree.pem",
"src/pki/testdata/name_constraints_unittest/invalid-no_subtrees.pem",
@@ -554,8 +552,6 @@
"src/pki/testdata/name_constraints_unittest/uri-permitted.pem",
"src/pki/testdata/name_constraints_unittest/x400address-excluded.pem",
"src/pki/testdata/name_constraints_unittest/x400address-permitted.pem",
- "src/pki/testdata/nist-pkits/BUILD.gn",
- "src/pki/testdata/nist-pkits/README.chromium",
"src/pki/testdata/nist-pkits/certs/AllCertificatesNoPoliciesTest2EE.crt",
"src/pki/testdata/nist-pkits/certs/AllCertificatesSamePoliciesTest10EE.crt",
"src/pki/testdata/nist-pkits/certs/AllCertificatesSamePoliciesTest13EE.crt",
@@ -1134,11 +1130,6 @@
"src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subCARE2CRL.crl",
"src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subsubCARE2RE4CRL.crl",
"src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subsubsubCARE2RE4CRL.crl",
- "src/pki/testdata/nist-pkits/generate_tests.py",
- "src/pki/testdata/nist-pkits/pkits_testcases-inl.h",
- "src/pki/testdata/nist-pkits/test_bundle_data.filelist",
- "src/pki/testdata/nist-pkits/test_bundle_data.globlist",
- "src/pki/testdata/ocsp_unittest/annotate_test_data.py",
"src/pki/testdata/ocsp_unittest/bad_ocsp_type.pem",
"src/pki/testdata/ocsp_unittest/bad_signature.pem",
"src/pki/testdata/ocsp_unittest/bad_status.pem",
@@ -1151,7 +1142,6 @@
"src/pki/testdata/ocsp_unittest/has_extension.pem",
"src/pki/testdata/ocsp_unittest/has_single_extension.pem",
"src/pki/testdata/ocsp_unittest/has_version.pem",
- "src/pki/testdata/ocsp_unittest/make_ocsp.py",
"src/pki/testdata/ocsp_unittest/malformed_request.pem",
"src/pki/testdata/ocsp_unittest/missing_response.pem",
"src/pki/testdata/ocsp_unittest/multiple_response.pem",
@@ -1170,7 +1160,6 @@
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/empty_sequence.pem",
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/extra_contents_after_extension_sequence.pem",
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/extra_contents_after_issuer_and_serial.pem",
- "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/generate.py",
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_contents.pem",
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_issuer.pem",
"src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_key_identifier.pem",
@@ -1229,8 +1218,6 @@
"src/pki/testdata/parse_certificate_unittest/policy_constraints_inhibit_require.pem",
"src/pki/testdata/parse_certificate_unittest/policy_constraints_require.pem",
"src/pki/testdata/parse_certificate_unittest/policy_qualifiers_empty_sequence.pem",
- "src/pki/testdata/parse_certificate_unittest/rebase-errors.py",
- "src/pki/testdata/parse_certificate_unittest/regenerate_pem_from_ascii.py",
"src/pki/testdata/parse_certificate_unittest/serial_37_bytes.pem",
"src/pki/testdata/parse_certificate_unittest/serial_negative.pem",
"src/pki/testdata/parse_certificate_unittest/serial_not_minimal.pem",
@@ -1273,20 +1260,12 @@
"src/pki/testdata/parse_certificate_unittest/tbs_validity_relaxed.pem",
"src/pki/testdata/parse_certificate_unittest/tbs_validity_utc_time_and_generalized_time.pem",
"src/pki/testdata/parse_certificate_unittest/v1_explicit_version.pem",
- "src/pki/testdata/parse_certificate_unittest/v3_certificate_template.pk8",
- "src/pki/testdata/parse_certificate_unittest/v3_certificate_template.txt",
- "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/generate-certs.py",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_match_name_only.pem",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_matching.pem",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_mismatch.pem",
- "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Intermediate.key",
- "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Root.key",
- "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Root2.key",
- "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Target.key",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/root.pem",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/root2.pem",
"src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/target.pem",
- "src/pki/testdata/path_builder_unittest/key_id_prioritization/generate-certs.py",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_a.pem",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_b.pem",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_c.pem",
@@ -1296,277 +1275,34 @@
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_a.pem",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_b.pem",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_c.pem",
- "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Intermediate.key",
- "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Intermediate_1.key",
- "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Root.key",
- "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Target.key",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/root.pem",
"src/pki/testdata/path_builder_unittest/key_id_prioritization/target.pem",
- "src/pki/testdata/path_builder_unittest/self_issued_prioritization/generate-certs.py",
- "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Root1.key",
- "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Root2.key",
- "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Target.key",
+ "src/pki/testdata/path_builder_unittest/multi-root-A-by-B.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-B-by-C.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-B-by-F.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-C-by-D.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-C-by-E.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-D-by-D.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-E-by-E.pem",
+ "src/pki/testdata/path_builder_unittest/multi-root-F-by-E.pem",
+ "src/pki/testdata/path_builder_unittest/precertificate/precertificate.pem",
+ "src/pki/testdata/path_builder_unittest/precertificate/root.pem",
"src/pki/testdata/path_builder_unittest/self_issued_prioritization/root1.pem",
"src/pki/testdata/path_builder_unittest/self_issued_prioritization/root1_cross.pem",
"src/pki/testdata/path_builder_unittest/self_issued_prioritization/root2.pem",
"src/pki/testdata/path_builder_unittest/self_issued_prioritization/target.pem",
- "src/pki/testdata/path_builder_unittest/validity_date_prioritization/generate-certs.py",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_ac.pem",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_ad.pem",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_bc.pem",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_bd.pem",
- "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Intermediate.key",
- "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Root.key",
- "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Target.key",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/root.pem",
"src/pki/testdata/path_builder_unittest/validity_date_prioritization/target.pem",
- "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-1024-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-2048-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-768-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/1024-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/10_year_validity.pem",
- "src/pki/testdata/ssl/certificates/11_year_validity.pem",
- "src/pki/testdata/ssl/certificates/2029_globalsign_com_cert.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-1024-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-2048-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-768-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/2048-rsa-root.pem",
- "src/pki/testdata/ssl/certificates/398_days_1_second_after_2020_09_01.pem",
- "src/pki/testdata/ssl/certificates/398_days_after_2020_09_01.pem",
- "src/pki/testdata/ssl/certificates/399_days_after_2020_09_01.pem",
- "src/pki/testdata/ssl/certificates/39_months_after_2015_04.pem",
- "src/pki/testdata/ssl/certificates/39_months_based_on_last_day.pem",
- "src/pki/testdata/ssl/certificates/40_months_after_2015_04.pem",
- "src/pki/testdata/ssl/certificates/60_months_after_2012_07.pem",
- "src/pki/testdata/ssl/certificates/61_months_after_2012_07.pem",
- "src/pki/testdata/ssl/certificates/768-rsa-ee-by-1024-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/768-rsa-ee-by-2048-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/768-rsa-ee-by-768-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/768-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/768-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/825_days_1_second_after_2018_03_01.pem",
- "src/pki/testdata/ssl/certificates/825_days_after_2018_03_01.pem",
- "src/pki/testdata/ssl/certificates/826_days_after_2018_03_01.pem",
- "src/pki/testdata/ssl/certificates/900_days_after_2019_07_01.pem",
- "src/pki/testdata/ssl/certificates/BUILD.gn",
- "src/pki/testdata/ssl/certificates/README",
- "src/pki/testdata/ssl/certificates/bad_validity.pem",
- "src/pki/testdata/ssl/certificates/can_sign_http_exchanges_draft_extension.pem",
- "src/pki/testdata/ssl/certificates/can_sign_http_exchanges_draft_extension_invalid.pem",
- "src/pki/testdata/ssl/certificates/client-empty-password.p12",
- "src/pki/testdata/ssl/certificates/client-nokey.p12",
- "src/pki/testdata/ssl/certificates/client-null-password.p12",
- "src/pki/testdata/ssl/certificates/client.p12",
- "src/pki/testdata/ssl/certificates/client_1.key",
- "src/pki/testdata/ssl/certificates/client_1.pem",
- "src/pki/testdata/ssl/certificates/client_1.pk8",
- "src/pki/testdata/ssl/certificates/client_1_ca.pem",
- "src/pki/testdata/ssl/certificates/client_2.key",
- "src/pki/testdata/ssl/certificates/client_2.pem",
- "src/pki/testdata/ssl/certificates/client_2.pk8",
- "src/pki/testdata/ssl/certificates/client_2_ca.pem",
- "src/pki/testdata/ssl/certificates/client_3.key",
- "src/pki/testdata/ssl/certificates/client_3.pem",
- "src/pki/testdata/ssl/certificates/client_3.pk8",
- "src/pki/testdata/ssl/certificates/client_3_ca.pem",
- "src/pki/testdata/ssl/certificates/client_4.key",
- "src/pki/testdata/ssl/certificates/client_4.pem",
- "src/pki/testdata/ssl/certificates/client_4.pk8",
- "src/pki/testdata/ssl/certificates/client_4_ca.pem",
- "src/pki/testdata/ssl/certificates/client_5.key",
- "src/pki/testdata/ssl/certificates/client_5.pem",
- "src/pki/testdata/ssl/certificates/client_5.pk8",
- "src/pki/testdata/ssl/certificates/client_5_ca.pem",
- "src/pki/testdata/ssl/certificates/client_6.key",
- "src/pki/testdata/ssl/certificates/client_6.pem",
- "src/pki/testdata/ssl/certificates/client_6.pk8",
- "src/pki/testdata/ssl/certificates/client_6_ca.pem",
- "src/pki/testdata/ssl/certificates/client_7.key",
- "src/pki/testdata/ssl/certificates/client_7.pem",
- "src/pki/testdata/ssl/certificates/client_7.pk8",
- "src/pki/testdata/ssl/certificates/client_7_ca.pem",
- "src/pki/testdata/ssl/certificates/client_root_ca.pem",
- "src/pki/testdata/ssl/certificates/common_name_only.pem",
- "src/pki/testdata/ssl/certificates/crit-codeSigning-chain.pem",
- "src/pki/testdata/ssl/certificates/crlset_blocked_interception_by_intermediate.raw",
- "src/pki/testdata/ssl/certificates/crlset_blocked_interception_by_root.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_intermediate_serial.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_leaf_spki.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_leaf_subject_no_spki.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_root_serial.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_root_spki.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_root_subject.raw",
- "src/pki/testdata/ssl/certificates/crlset_by_root_subject_no_spki.raw",
- "src/pki/testdata/ssl/certificates/crlset_known_interception_by_root.raw",
- "src/pki/testdata/ssl/certificates/cross-signed-leaf.pem",
- "src/pki/testdata/ssl/certificates/cross-signed-root-md5.pem",
- "src/pki/testdata/ssl/certificates/cross-signed-root-sha256.pem",
- "src/pki/testdata/ssl/certificates/ct-test-embedded-cert.pem",
- "src/pki/testdata/ssl/certificates/ct-test-embedded-with-intermediate-chain.pem",
- "src/pki/testdata/ssl/certificates/ct-test-embedded-with-intermediate-preca-chain.pem",
- "src/pki/testdata/ssl/certificates/ct-test-embedded-with-preca-chain.pem",
- "src/pki/testdata/ssl/certificates/ct-test-embedded-with-uids.pem",
- "src/pki/testdata/ssl/certificates/dec_2017.pem",
- "src/pki/testdata/ssl/certificates/diginotar_cyber_ca.pem",
- "src/pki/testdata/ssl/certificates/diginotar_pkioverheid.pem",
- "src/pki/testdata/ssl/certificates/diginotar_pkioverheid_g2.pem",
- "src/pki/testdata/ssl/certificates/diginotar_public_ca_2025.pem",
- "src/pki/testdata/ssl/certificates/diginotar_root_ca.pem",
- "src/pki/testdata/ssl/certificates/diginotar_services_1024_ca.pem",
- "src/pki/testdata/ssl/certificates/duplicate_cn_1.p12",
- "src/pki/testdata/ssl/certificates/duplicate_cn_1.pem",
- "src/pki/testdata/ssl/certificates/duplicate_cn_2.p12",
- "src/pki/testdata/ssl/certificates/duplicate_cn_2.pem",
- "src/pki/testdata/ssl/certificates/ec-prime256v1-1.key",
- "src/pki/testdata/ssl/certificates/ec-prime256v1-2.key",
- "src/pki/testdata/ssl/certificates/ec-prime256v1-3.key",
- "src/pki/testdata/ssl/certificates/eku-test-root.pem",
- "src/pki/testdata/ssl/certificates/ev_test.pem",
- "src/pki/testdata/ssl/certificates/ev_test_state_only.pem",
- "src/pki/testdata/ssl/certificates/expired_cert.pem",
- "src/pki/testdata/ssl/certificates/foaf.me.chromium-test-cert.der",
- "src/pki/testdata/ssl/certificates/google.binary.p7b",
- "src/pki/testdata/ssl/certificates/google.chain.pem",
- "src/pki/testdata/ssl/certificates/google.pem_cert.p7b",
- "src/pki/testdata/ssl/certificates/google.pem_pkcs7.p7b",
- "src/pki/testdata/ssl/certificates/google.single.der",
- "src/pki/testdata/ssl/certificates/google.single.pem",
- "src/pki/testdata/ssl/certificates/google_diginotar.pem",
- "src/pki/testdata/ssl/certificates/intermediate_ca_cert.pem",
- "src/pki/testdata/ssl/certificates/invalid_key_usage_cert.der",
- "src/pki/testdata/ssl/certificates/key_usage_p256.key",
- "src/pki/testdata/ssl/certificates/key_usage_p256_both.pem",
- "src/pki/testdata/ssl/certificates/key_usage_p256_digitalsignature.pem",
- "src/pki/testdata/ssl/certificates/key_usage_p256_keyagreement.pem",
- "src/pki/testdata/ssl/certificates/key_usage_p256_no_extension.pem",
- "src/pki/testdata/ssl/certificates/key_usage_rsa.key",
- "src/pki/testdata/ssl/certificates/key_usage_rsa_both.pem",
- "src/pki/testdata/ssl/certificates/key_usage_rsa_digitalsignature.pem",
- "src/pki/testdata/ssl/certificates/key_usage_rsa_keyencipherment.pem",
- "src/pki/testdata/ssl/certificates/key_usage_rsa_no_extension.pem",
- "src/pki/testdata/ssl/certificates/large_key.pem",
- "src/pki/testdata/ssl/certificates/leaf_from_known_root.pem",
- "src/pki/testdata/ssl/certificates/lets-encrypt-dst-x3-root.pem",
- "src/pki/testdata/ssl/certificates/lets-encrypt-isrg-x1-root.pem",
- "src/pki/testdata/ssl/certificates/localhost_cert.pem",
- "src/pki/testdata/ssl/certificates/may_2018.pem",
- "src/pki/testdata/ssl/certificates/mit.davidben.der",
- "src/pki/testdata/ssl/certificates/multi-root-A-by-B.pem",
- "src/pki/testdata/ssl/certificates/multi-root-B-by-C.pem",
- "src/pki/testdata/ssl/certificates/multi-root-B-by-F.pem",
- "src/pki/testdata/ssl/certificates/multi-root-C-by-D.pem",
- "src/pki/testdata/ssl/certificates/multi-root-C-by-E.pem",
- "src/pki/testdata/ssl/certificates/multi-root-D-by-D.pem",
- "src/pki/testdata/ssl/certificates/multi-root-E-by-E.pem",
- "src/pki/testdata/ssl/certificates/multi-root-F-by-E.pem",
- "src/pki/testdata/ssl/certificates/multi-root-chain1.pem",
- "src/pki/testdata/ssl/certificates/multi-root-chain2.pem",
- "src/pki/testdata/ssl/certificates/multi-root-crlset-C.raw",
- "src/pki/testdata/ssl/certificates/multi-root-crlset-CD-and-FE.raw",
- "src/pki/testdata/ssl/certificates/multi-root-crlset-D-and-E.raw",
- "src/pki/testdata/ssl/certificates/multi-root-crlset-E.raw",
- "src/pki/testdata/ssl/certificates/multi-root-crlset-unrelated.raw",
- "src/pki/testdata/ssl/certificates/multi-root.keychain",
- "src/pki/testdata/ssl/certificates/multivalue_rdn.pem",
- "src/pki/testdata/ssl/certificates/name_constrained_key.pem",
- "src/pki/testdata/ssl/certificates/ndn.ca.crt",
- "src/pki/testdata/ssl/certificates/nist.der",
- "src/pki/testdata/ssl/certificates/no_subject_common_name_cert.pem",
- "src/pki/testdata/ssl/certificates/non-crit-codeSigning-chain.pem",
- "src/pki/testdata/ssl/certificates/ok_cert.pem",
- "src/pki/testdata/ssl/certificates/ok_cert_by_intermediate.pem",
- "src/pki/testdata/ssl/certificates/policies_sanity_check.pem",
- "src/pki/testdata/ssl/certificates/post_june_2016.pem",
- "src/pki/testdata/ssl/certificates/pre_br_validity_bad_121.pem",
- "src/pki/testdata/ssl/certificates/pre_br_validity_bad_2020.pem",
- "src/pki/testdata/ssl/certificates/pre_br_validity_ok.pem",
- "src/pki/testdata/ssl/certificates/pre_june_2016.pem",
- "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-1024-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-2048-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-768-rsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-prime256v1-ecdsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-intermediate.pem",
- "src/pki/testdata/ssl/certificates/punycodetest.pem",
- "src/pki/testdata/ssl/certificates/quic-chain.pem",
- "src/pki/testdata/ssl/certificates/quic-ecdsa-leaf.key",
- "src/pki/testdata/ssl/certificates/quic-leaf-cert.key",
- "src/pki/testdata/ssl/certificates/quic-leaf-cert.key.pkcs8.pem",
- "src/pki/testdata/ssl/certificates/quic-leaf-cert.key.sct",
- "src/pki/testdata/ssl/certificates/quic-root.pem",
- "src/pki/testdata/ssl/certificates/quic-short-lived.pem",
- "src/pki/testdata/ssl/certificates/redundant-server-chain.pem",
- "src/pki/testdata/ssl/certificates/redundant-validated-chain-root.pem",
- "src/pki/testdata/ssl/certificates/redundant-validated-chain.pem",
- "src/pki/testdata/ssl/certificates/root_ca_cert.pem",
- "src/pki/testdata/ssl/certificates/rsa-1024-1.key",
- "src/pki/testdata/ssl/certificates/rsa-1024-2.key",
- "src/pki/testdata/ssl/certificates/rsa-1024-3.key",
- "src/pki/testdata/ssl/certificates/rsa-2048-1.key",
- "src/pki/testdata/ssl/certificates/rsa-2048-2.key",
- "src/pki/testdata/ssl/certificates/rsa-2048-3.key",
- "src/pki/testdata/ssl/certificates/rsa-768-1.key",
- "src/pki/testdata/ssl/certificates/rsa-768-2.key",
- "src/pki/testdata/ssl/certificates/rsa-768-3.key",
- "src/pki/testdata/ssl/certificates/rsa-8200-1.key",
- "src/pki/testdata/ssl/certificates/salesforce_com_test.pem",
- "src/pki/testdata/ssl/certificates/self-signed-invalid-name.pem",
- "src/pki/testdata/ssl/certificates/self-signed-invalid-sig.pem",
- "src/pki/testdata/ssl/certificates/sha1_2016.pem",
- "src/pki/testdata/ssl/certificates/sha1_leaf.pem",
- "src/pki/testdata/ssl/certificates/spdy_pooling.pem",
- "src/pki/testdata/ssl/certificates/start_after_expiry.pem",
- "src/pki/testdata/ssl/certificates/subjectAltName_sanity_check.pem",
- "src/pki/testdata/ssl/certificates/subjectAltName_www_example_com.pem",
- "src/pki/testdata/ssl/certificates/test_names.pem",
- "src/pki/testdata/ssl/certificates/treadclimber.pem",
- "src/pki/testdata/ssl/certificates/treadclimber.sctlist",
- "src/pki/testdata/ssl/certificates/unescaped.pem",
- "src/pki/testdata/ssl/certificates/unittest.key.bin",
- "src/pki/testdata/ssl/certificates/unittest.selfsigned.der",
- "src/pki/testdata/ssl/certificates/verisign_intermediate_ca_2011.pem",
- "src/pki/testdata/ssl/certificates/verisign_intermediate_ca_2016.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md2_ee.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md2_intermediate.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md2_root.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md4_ee.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md4_intermediate.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md4_root.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md5_ee.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md5_intermediate.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_md5_root.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_sha1_ee.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_sha1_intermediate.pem",
- "src/pki/testdata/ssl/certificates/weak_digest_sha1_root.pem",
- "src/pki/testdata/ssl/certificates/websocket_cacert.pem",
- "src/pki/testdata/ssl/certificates/websocket_client_cert.p12",
- "src/pki/testdata/ssl/certificates/wildcard.pem",
- "src/pki/testdata/ssl/certificates/x509_verify_results.chain.pem",
- "src/pki/testdata/test_certificate_data.h",
- "src/pki/testdata/verify_certificate_chain_unittest/README",
"src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Intermediate_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/not-after.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/not-before.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Target.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/expired-unconstrained-root_Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/expired-unconstrained-root_Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-expiration-and-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-expiration.test",
@@ -1574,67 +1310,32 @@
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-before-ta-with-expiration.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-before.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-target/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-target/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/expired-target/not-after.test",
"src/pki/testdata/verify_certificate_chain_unittest/expired-target/not-before.test",
- "src/pki/testdata/verify_certificate_chain_unittest/generate-all.sh",
"src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/BogusRoot.key",
- "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Intermediate_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Root_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Target.key",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Target.key",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Intermediate_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Root_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Target.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Target_1.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-chain.pem",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-eku-any.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-eku-clientAuth-strict.test",
@@ -1647,52 +1348,24 @@
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-clientAuth.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-serverAuth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-serverAuth.test",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-invalid-spki/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/intermediate-invalid-spki/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Root_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/anchor.pem",
"src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/target.pem",
"src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/target.test",
- "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Root_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/longrolloverchain.pem",
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/longrolloverchain.test",
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/newchain.pem",
@@ -1701,94 +1374,8 @@
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/oldchain.test",
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/rolloverchain.pem",
"src/pki/testdata/verify_certificate_chain_unittest/key-rollover/rolloverchain.test",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/t0.key",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/ok-all-types.pem",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/ok-all-types.test",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D4.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D5.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D6.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D7.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D8.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D9.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8DA.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8DB.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F5.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F6.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F7.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F8.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F9.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FA.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FB.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FC.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FD.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.attr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.attr.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.serial.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.attr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.attr.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.serial.old",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.db",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.pem",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-all-types.pem",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-all-types.test",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-dirnames-excluded.pem",
@@ -1804,11 +1391,6 @@
"src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-ips-permitted.pem",
"src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-ips-permitted.test",
"src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/ShadowRoot.key",
- "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.1.2.txt",
@@ -1906,95 +1488,42 @@
"src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.9.7.txt",
"src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.9.8.txt",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-ok/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/ta-with-constraints.test",
- "src/pki/testdata/verify_certificate_chain_unittest/rebase-errors.py",
"src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-constraints-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-constraints.test",
@@ -2002,28 +1531,16 @@
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-expiration.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-constraints-require-basic-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-require-basic-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/chain.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/distrusted-root-expired.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/distrusted-root.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/ta-with-constraints.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/ta-with-expiration.test",
@@ -2032,122 +1549,54 @@
"src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/unspecified-trust-root.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Target.key",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/chain.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Target.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/3F1D2B1D127E34B62B61B278F274669ADC66ADCC.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/71F49EE7B5F73630C9845EA5B8398B58F3237B18.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/71F49EE7B5F73630C9845EA5B8398B58F3237B19.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db.attr",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db.old",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.serial.old",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Issuer.db",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Issuer.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.attr",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.attr.old",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.old",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.serial",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.serial.old",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.cnf",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.csr",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.db",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.serial",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/any.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/chain.pem",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth-strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/serverauth-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/serverauth.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only-trusted_leaf-strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only-trusted_leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/main.test",
+ "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/strict-leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/strict.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-only/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-only/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-only/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-only/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf-and-trust_anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf-not_after.test",
@@ -2155,16 +1604,11 @@
"src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf_require_self_signed.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/keys/Target.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/keys/Target_1.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf-and-trust_anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf_require_self_signed.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-and-trust_anchor.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-not_after.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-wrong_eku.test",
@@ -2178,11 +1622,6 @@
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyAgreement.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyEncipherment.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyEncipherment.test",
- "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Target-ec.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Target-rsa.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-decipherOnly.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-decipherOnly.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-digitalSignature.pem",
@@ -2192,70 +1631,26 @@
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-keyEncipherment.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-keyEncipherment.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/target_only-trusted_leaf.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/target_only.pem",
"src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Intermediate_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Intermediate_1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Intermediate.key",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Intermediate1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Intermediate2.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/chain.pem",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/generate-chains.py",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Intermediate1.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Intermediate2.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Root.key",
- "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Target.key",
"src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/main.test",
"src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/ta-with-constraints.test",
"src/pki/testdata/verify_name_match_unittest/names/ascii-BMPSTRING-case_swap-dupe_attr.pem",
@@ -2340,9 +1735,6 @@
"src/pki/testdata/verify_name_match_unittest/names/unicode_supplementary-UTF8-unmangled.pem",
"src/pki/testdata/verify_name_match_unittest/names/valid-Name-empty.pem",
"src/pki/testdata/verify_name_match_unittest/names/valid-minimal.pem",
- "src/pki/testdata/verify_name_match_unittest/scripts/generate_names.py",
- "src/pki/testdata/verify_signed_data_unittest/README",
- "src/pki/testdata/verify_signed_data_unittest/annotate_test_data.py",
"src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-spki-params-null.pem",
"src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-unused-bits-signature.pem",
"src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-using-ecdh-key.pem",
@@ -2369,6 +1761,8 @@
"src/pki/testdata/verify_signed_data_unittest/rsa-pss-sha256.pem",
"src/pki/testdata/verify_signed_data_unittest/rsa-using-ec-key.pem",
"src/pki/testdata/verify_signed_data_unittest/rsa2048-pkcs1-sha512.pem",
+ "src/pki/testdata/verify_unittest/google-leaf.der",
+ "src/pki/testdata/verify_unittest/self-issued.pem",
]
ssl_test_sources = [
@@ -2382,11 +1776,10 @@
"src/crypto/test/gtest_main.cc",
"src/pki/cert_issuer_source_static_unittest.cc",
"src/pki/certificate_policies_unittest.cc",
+ "src/pki/certificate_unittest.cc",
"src/pki/crl_unittest.cc",
"src/pki/encode_values_unittest.cc",
"src/pki/extended_key_usage_unittest.cc",
- "src/pki/fillins/file_util.cc",
- "src/pki/fillins/path_service.cc",
"src/pki/general_names_unittest.cc",
"src/pki/input_unittest.cc",
"src/pki/ip_util_unittest.cc",
@@ -2402,11 +1795,13 @@
"src/pki/path_builder_pkits_unittest.cc",
"src/pki/path_builder_unittest.cc",
"src/pki/path_builder_verify_certificate_chain_unittest.cc",
+ "src/pki/pem_unittest.cc",
"src/pki/signature_algorithm_unittest.cc",
"src/pki/simple_path_builder_delegate_unittest.cc",
"src/pki/string_util_unittest.cc",
"src/pki/test_helpers.cc",
"src/pki/trust_store_collection_unittest.cc",
+ "src/pki/trust_store_in_memory_unittest.cc",
"src/pki/verify_certificate_chain_pkits_unittest.cc",
"src/pki/verify_certificate_chain_unittest.cc",
"src/pki/verify_name_match_unittest.cc",
diff --git a/apple-aarch64/crypto/chacha/chacha-armv8-apple.S b/apple-aarch64/crypto/chacha/chacha-armv8-apple.S
deleted file mode 100644
index dd1c964..0000000
--- a/apple-aarch64/crypto/chacha/chacha-armv8-apple.S
+++ /dev/null
@@ -1,1984 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-
-.private_extern _OPENSSL_armcap_P
-
-.section __TEXT,__const
-
-.align 5
-Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-Lone:
-.long 1,0,0,0
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-
-.text
-
-.globl _ChaCha20_ctr32
-.private_extern _ChaCha20_ctr32
-
-.align 5
-_ChaCha20_ctr32:
- AARCH64_VALID_CALL_TARGET
- cbz x2,Labort
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
- adrp x5,_OPENSSL_armcap_P@PAGE
-#endif
- cmp x2,#192
- b.lo Lshort
- ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
-Lshort:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adrp x5,Lsigma@PAGE
- add x5,x5,Lsigma@PAGEOFF
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ldp x28,x30,[x4] // load counter
-#ifdef __AARCH64EB__
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
-
-Loop_outer:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov w7,w23
- lsr x8,x23,#32
- mov w9,w24
- lsr x10,x24,#32
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#64
-Loop:
- sub x4,x4,#1
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- ror w21,w21,#16
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#20
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- add w5,w5,w9
- add w6,w6,w10
- add w7,w7,w11
- add w8,w8,w12
- eor w17,w17,w5
- eor w19,w19,w6
- eor w20,w20,w7
- eor w21,w21,w8
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- ror w21,w21,#24
- add w13,w13,w17
- add w14,w14,w19
- add w15,w15,w20
- add w16,w16,w21
- eor w9,w9,w13
- eor w10,w10,w14
- eor w11,w11,w15
- eor w12,w12,w16
- ror w9,w9,#25
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#16
- ror w17,w17,#16
- ror w19,w19,#16
- ror w20,w20,#16
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#20
- ror w11,w11,#20
- ror w12,w12,#20
- ror w9,w9,#20
- add w5,w5,w10
- add w6,w6,w11
- add w7,w7,w12
- add w8,w8,w9
- eor w21,w21,w5
- eor w17,w17,w6
- eor w19,w19,w7
- eor w20,w20,w8
- ror w21,w21,#24
- ror w17,w17,#24
- ror w19,w19,#24
- ror w20,w20,#24
- add w15,w15,w21
- add w16,w16,w17
- add w13,w13,w19
- add w14,w14,w20
- eor w10,w10,w15
- eor w11,w11,w16
- eor w12,w12,w13
- eor w9,w9,w14
- ror w10,w10,#25
- ror w11,w11,#25
- ror w12,w12,#25
- ror w9,w9,#25
- cbnz x4,Loop
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- b.lo Ltail
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
-
- b.hi Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
-Labort:
- ret
-
-.align 4
-Ltail:
- add x2,x2,#64
-Less_than_64:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- stp x5,x7,[sp,#0]
- stp x9,x11,[sp,#16]
- stp x13,x15,[sp,#32]
- stp x17,x20,[sp,#48]
-
-Loop_tail:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-
-.align 5
-ChaCha20_neon:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adrp x5,Lsigma@PAGE
- add x5,x5,Lsigma@PAGEOFF
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- cmp x2,#512
- b.hs L512_or_more_neon
-
- sub sp,sp,#64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __AARCH64EB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
-Loop_outer_neon:
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- mov v0.16b,v24.16b
- mov w7,w23
- lsr x8,x23,#32
- mov v4.16b,v24.16b
- mov w9,w24
- lsr x10,x24,#32
- mov v16.16b,v24.16b
- mov w11,w25
- mov v1.16b,v25.16b
- lsr x12,x25,#32
- mov v5.16b,v25.16b
- mov w13,w26
- mov v17.16b,v25.16b
- lsr x14,x26,#32
- mov v3.16b,v27.16b
- mov w15,w27
- mov v7.16b,v28.16b
- lsr x16,x27,#32
- mov v19.16b,v29.16b
- mov w17,w28
- mov v2.16b,v26.16b
- lsr x19,x28,#32
- mov v6.16b,v26.16b
- mov w20,w30
- mov v18.16b,v26.16b
- lsr x21,x30,#32
-
- mov x4,#10
- subs x2,x2,#256
-Loop_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w11
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w12
- eor v7.16b,v7.16b,v4.16b
- eor w17,w17,w5
- eor v19.16b,v19.16b,v16.16b
- eor w19,w19,w6
- rev32 v3.8h,v3.8h
- eor w20,w20,w7
- rev32 v7.8h,v7.8h
- eor w21,w21,w8
- rev32 v19.8h,v19.8h
- ror w17,w17,#16
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#16
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#16
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#16
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#20
- add w16,w16,w21
- ushr v5.4s,v21.4s,#20
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#20
- eor w10,w10,w14
- sli v1.4s,v20.4s,#12
- eor w11,w11,w15
- sli v5.4s,v21.4s,#12
- eor w12,w12,w16
- sli v17.4s,v22.4s,#12
- ror w9,w9,#20
- add v0.4s,v0.4s,v1.4s
- ror w10,w10,#20
- add v4.4s,v4.4s,v5.4s
- ror w11,w11,#20
- add v16.4s,v16.4s,v17.4s
- ror w12,w12,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w9
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w10
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w11
- ushr v3.4s,v20.4s,#24
- add w8,w8,w12
- ushr v7.4s,v21.4s,#24
- eor w17,w17,w5
- ushr v19.4s,v22.4s,#24
- eor w19,w19,w6
- sli v3.4s,v20.4s,#8
- eor w20,w20,w7
- sli v7.4s,v21.4s,#8
- eor w21,w21,w8
- sli v19.4s,v22.4s,#8
- ror w17,w17,#24
- add v2.4s,v2.4s,v3.4s
- ror w19,w19,#24
- add v6.4s,v6.4s,v7.4s
- ror w20,w20,#24
- add v18.4s,v18.4s,v19.4s
- ror w21,w21,#24
- eor v20.16b,v1.16b,v2.16b
- add w13,w13,w17
- eor v21.16b,v5.16b,v6.16b
- add w14,w14,w19
- eor v22.16b,v17.16b,v18.16b
- add w15,w15,w20
- ushr v1.4s,v20.4s,#25
- add w16,w16,w21
- ushr v5.4s,v21.4s,#25
- eor w9,w9,w13
- ushr v17.4s,v22.4s,#25
- eor w10,w10,w14
- sli v1.4s,v20.4s,#7
- eor w11,w11,w15
- sli v5.4s,v21.4s,#7
- eor w12,w12,w16
- sli v17.4s,v22.4s,#7
- ror w9,w9,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w10,w10,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w10
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w11
- add v16.4s,v16.4s,v17.4s
- add w7,w7,w12
- eor v3.16b,v3.16b,v0.16b
- add w8,w8,w9
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w5
- eor v19.16b,v19.16b,v16.16b
- eor w17,w17,w6
- rev32 v3.8h,v3.8h
- eor w19,w19,w7
- rev32 v7.8h,v7.8h
- eor w20,w20,w8
- rev32 v19.8h,v19.8h
- ror w21,w21,#16
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#16
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#16
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#16
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#20
- add w14,w14,w20
- ushr v5.4s,v21.4s,#20
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#20
- eor w11,w11,w16
- sli v1.4s,v20.4s,#12
- eor w12,w12,w13
- sli v5.4s,v21.4s,#12
- eor w9,w9,w14
- sli v17.4s,v22.4s,#12
- ror w10,w10,#20
- add v0.4s,v0.4s,v1.4s
- ror w11,w11,#20
- add v4.4s,v4.4s,v5.4s
- ror w12,w12,#20
- add v16.4s,v16.4s,v17.4s
- ror w9,w9,#20
- eor v20.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v21.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v22.16b,v19.16b,v16.16b
- add w7,w7,w12
- ushr v3.4s,v20.4s,#24
- add w8,w8,w9
- ushr v7.4s,v21.4s,#24
- eor w21,w21,w5
- ushr v19.4s,v22.4s,#24
- eor w17,w17,w6
- sli v3.4s,v20.4s,#8
- eor w19,w19,w7
- sli v7.4s,v21.4s,#8
- eor w20,w20,w8
- sli v19.4s,v22.4s,#8
- ror w21,w21,#24
- add v2.4s,v2.4s,v3.4s
- ror w17,w17,#24
- add v6.4s,v6.4s,v7.4s
- ror w19,w19,#24
- add v18.4s,v18.4s,v19.4s
- ror w20,w20,#24
- eor v20.16b,v1.16b,v2.16b
- add w15,w15,w21
- eor v21.16b,v5.16b,v6.16b
- add w16,w16,w17
- eor v22.16b,v17.16b,v18.16b
- add w13,w13,w19
- ushr v1.4s,v20.4s,#25
- add w14,w14,w20
- ushr v5.4s,v21.4s,#25
- eor w10,w10,w15
- ushr v17.4s,v22.4s,#25
- eor w11,w11,w16
- sli v1.4s,v20.4s,#7
- eor w12,w12,w13
- sli v5.4s,v21.4s,#7
- eor w9,w9,w14
- sli v17.4s,v22.4s,#7
- ror w10,w10,#25
- ext v2.16b,v2.16b,v2.16b,#8
- ror w11,w11,#25
- ext v6.16b,v6.16b,v6.16b,#8
- ror w12,w12,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- cbnz x4,Loop_neon
-
- add w5,w5,w22 // accumulate key block
- add v0.4s,v0.4s,v24.4s
- add x6,x6,x22,lsr#32
- add v4.4s,v4.4s,v24.4s
- add w7,w7,w23
- add v16.4s,v16.4s,v24.4s
- add x8,x8,x23,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w9,w9,w24
- add v6.4s,v6.4s,v26.4s
- add x10,x10,x24,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w11,w11,w25
- add v3.4s,v3.4s,v27.4s
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add v7.4s,v7.4s,v28.4s
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add v19.4s,v19.4s,v29.4s
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add v1.4s,v1.4s,v25.4s
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add v5.4s,v5.4s,v25.4s
- add x21,x21,x30,lsr#32
- add v17.4s,v17.4s,v25.4s
-
- b.lo Ltail_neon
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v20.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v21.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v22.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v23.16b
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- add v27.4s,v27.4s,v31.4s // += 4
- stp x13,x15,[x0,#32]
- add v28.4s,v28.4s,v31.4s
- stp x17,x20,[x0,#48]
- add v29.4s,v29.4s,v31.4s
- add x0,x0,#64
-
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- eor v16.16b,v16.16b,v0.16b
- eor v17.16b,v17.16b,v1.16b
- eor v18.16b,v18.16b,v2.16b
- eor v19.16b,v19.16b,v3.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- b.hi Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-Ltail_neon:
- add x2,x2,#256
- cmp x2,#64
- b.lo Less_than_64
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#4 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- b.eq Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo Less_than_128
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v0.16b,v0.16b,v20.16b
- eor v1.16b,v1.16b,v21.16b
- eor v2.16b,v2.16b,v22.16b
- eor v3.16b,v3.16b,v23.16b
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
- b.eq Ldone_neon
- sub x2,x2,#64
- cmp x2,#64
- b.lo Less_than_192
-
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
- eor v4.16b,v4.16b,v20.16b
- eor v5.16b,v5.16b,v21.16b
- eor v6.16b,v6.16b,v22.16b
- eor v7.16b,v7.16b,v23.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
- b.eq Ldone_neon
- sub x2,x2,#64
-
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
- b Last_neon
-
-Less_than_128:
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
- b Last_neon
-Less_than_192:
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
- b Last_neon
-
-.align 4
-Last_neon:
- sub x0,x0,#1
- add x1,x1,x2
- add x0,x0,x2
- add x4,sp,x2
- neg x2,x2
-
-Loop_tail_neon:
- ldrb w10,[x1,x2]
- ldrb w11,[x4,x2]
- add x2,x2,#1
- eor w10,w10,w11
- strb w10,[x0,x2]
- cbnz x2,Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.align 5
-ChaCha20_512_neon:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adrp x5,Lsigma@PAGE
- add x5,x5,Lsigma@PAGEOFF
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
-L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp x22,x23,[x5] // load sigma
- ld1 {v24.4s},[x5],#16
- ldp x24,x25,[x3] // load key
- ldp x26,x27,[x3,#16]
- ld1 {v25.4s,v26.4s},[x3]
- ldp x28,x30,[x4] // load counter
- ld1 {v27.4s},[x4]
- ld1 {v31.4s},[x5]
-#ifdef __AARCH64EB__
- rev64 v24.4s,v24.4s
- ror x24,x24,#32
- ror x25,x25,#32
- ror x26,x26,#32
- ror x27,x27,#32
- ror x28,x28,#32
- ror x30,x30,#32
-#endif
- add v27.4s,v27.4s,v31.4s // += 1
- stp q24,q25,[sp,#0] // off-load key block, invariant part
- add v27.4s,v27.4s,v31.4s // not typo
- str q26,[sp,#32]
- add v28.4s,v27.4s,v31.4s
- add v29.4s,v28.4s,v31.4s
- add v30.4s,v29.4s,v31.4s
- shl v31.4s,v31.4s,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub x2,x2,#512 // not typo
-
-Loop_outer_512_neon:
- mov v0.16b,v24.16b
- mov v4.16b,v24.16b
- mov v8.16b,v24.16b
- mov v12.16b,v24.16b
- mov v16.16b,v24.16b
- mov v20.16b,v24.16b
- mov v1.16b,v25.16b
- mov w5,w22 // unpack key block
- mov v5.16b,v25.16b
- lsr x6,x22,#32
- mov v9.16b,v25.16b
- mov w7,w23
- mov v13.16b,v25.16b
- lsr x8,x23,#32
- mov v17.16b,v25.16b
- mov w9,w24
- mov v21.16b,v25.16b
- lsr x10,x24,#32
- mov v3.16b,v27.16b
- mov w11,w25
- mov v7.16b,v28.16b
- lsr x12,x25,#32
- mov v11.16b,v29.16b
- mov w13,w26
- mov v15.16b,v30.16b
- lsr x14,x26,#32
- mov v2.16b,v26.16b
- mov w15,w27
- mov v6.16b,v26.16b
- lsr x16,x27,#32
- add v19.4s,v3.4s,v31.4s // +4
- mov w17,w28
- add v23.4s,v7.4s,v31.4s // +4
- lsr x19,x28,#32
- mov v10.16b,v26.16b
- mov w20,w30
- mov v14.16b,v26.16b
- lsr x21,x30,#32
- mov v18.16b,v26.16b
- stp q27,q28,[sp,#48] // off-load key block, variable part
- mov v22.16b,v26.16b
- str q29,[sp,#80]
-
- mov x4,#5
- subs x2,x2,#512
-Loop_upper_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,Loop_upper_neon
-
- add w5,w5,w22 // accumulate key block
- add x6,x6,x22,lsr#32
- add w7,w7,w23
- add x8,x8,x23,lsr#32
- add w9,w9,w24
- add x10,x10,x24,lsr#32
- add w11,w11,w25
- add x12,x12,x25,lsr#32
- add w13,w13,w26
- add x14,x14,x26,lsr#32
- add w15,w15,w27
- add x16,x16,x27,lsr#32
- add w17,w17,w28
- add x19,x19,x28,lsr#32
- add w20,w20,w30
- add x21,x21,x30,lsr#32
-
- add x5,x5,x6,lsl#32 // pack
- add x7,x7,x8,lsl#32
- ldp x6,x8,[x1,#0] // load input
- add x9,x9,x10,lsl#32
- add x11,x11,x12,lsl#32
- ldp x10,x12,[x1,#16]
- add x13,x13,x14,lsl#32
- add x15,x15,x16,lsl#32
- ldp x14,x16,[x1,#32]
- add x17,x17,x19,lsl#32
- add x20,x20,x21,lsl#32
- ldp x19,x21,[x1,#48]
- add x1,x1,#64
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor x15,x15,x16
- eor x17,x17,x19
- eor x20,x20,x21
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#1 // increment counter
- mov w5,w22 // unpack key block
- lsr x6,x22,#32
- stp x9,x11,[x0,#16]
- mov w7,w23
- lsr x8,x23,#32
- stp x13,x15,[x0,#32]
- mov w9,w24
- lsr x10,x24,#32
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- mov w11,w25
- lsr x12,x25,#32
- mov w13,w26
- lsr x14,x26,#32
- mov w15,w27
- lsr x16,x27,#32
- mov w17,w28
- lsr x19,x28,#32
- mov w20,w30
- lsr x21,x30,#32
-
- mov x4,#5
-Loop_lower_neon:
- sub x4,x4,#1
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#12
- ext v7.16b,v7.16b,v7.16b,#12
- ext v11.16b,v11.16b,v11.16b,#12
- ext v15.16b,v15.16b,v15.16b,#12
- ext v19.16b,v19.16b,v19.16b,#12
- ext v23.16b,v23.16b,v23.16b,#12
- ext v1.16b,v1.16b,v1.16b,#4
- ext v5.16b,v5.16b,v5.16b,#4
- ext v9.16b,v9.16b,v9.16b,#4
- ext v13.16b,v13.16b,v13.16b,#4
- ext v17.16b,v17.16b,v17.16b,#4
- ext v21.16b,v21.16b,v21.16b,#4
- add v0.4s,v0.4s,v1.4s
- add w5,w5,w9
- add v4.4s,v4.4s,v5.4s
- add w6,w6,w10
- add v8.4s,v8.4s,v9.4s
- add w7,w7,w11
- add v12.4s,v12.4s,v13.4s
- add w8,w8,w12
- add v16.4s,v16.4s,v17.4s
- eor w17,w17,w5
- add v20.4s,v20.4s,v21.4s
- eor w19,w19,w6
- eor v3.16b,v3.16b,v0.16b
- eor w20,w20,w7
- eor v7.16b,v7.16b,v4.16b
- eor w21,w21,w8
- eor v11.16b,v11.16b,v8.16b
- ror w17,w17,#16
- eor v15.16b,v15.16b,v12.16b
- ror w19,w19,#16
- eor v19.16b,v19.16b,v16.16b
- ror w20,w20,#16
- eor v23.16b,v23.16b,v20.16b
- ror w21,w21,#16
- rev32 v3.8h,v3.8h
- add w13,w13,w17
- rev32 v7.8h,v7.8h
- add w14,w14,w19
- rev32 v11.8h,v11.8h
- add w15,w15,w20
- rev32 v15.8h,v15.8h
- add w16,w16,w21
- rev32 v19.8h,v19.8h
- eor w9,w9,w13
- rev32 v23.8h,v23.8h
- eor w10,w10,w14
- add v2.4s,v2.4s,v3.4s
- eor w11,w11,w15
- add v6.4s,v6.4s,v7.4s
- eor w12,w12,w16
- add v10.4s,v10.4s,v11.4s
- ror w9,w9,#20
- add v14.4s,v14.4s,v15.4s
- ror w10,w10,#20
- add v18.4s,v18.4s,v19.4s
- ror w11,w11,#20
- add v22.4s,v22.4s,v23.4s
- ror w12,w12,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w9
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w10
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w11
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w12
- eor v28.16b,v17.16b,v18.16b
- eor w17,w17,w5
- eor v29.16b,v21.16b,v22.16b
- eor w19,w19,w6
- ushr v1.4s,v24.4s,#20
- eor w20,w20,w7
- ushr v5.4s,v25.4s,#20
- eor w21,w21,w8
- ushr v9.4s,v26.4s,#20
- ror w17,w17,#24
- ushr v13.4s,v27.4s,#20
- ror w19,w19,#24
- ushr v17.4s,v28.4s,#20
- ror w20,w20,#24
- ushr v21.4s,v29.4s,#20
- ror w21,w21,#24
- sli v1.4s,v24.4s,#12
- add w13,w13,w17
- sli v5.4s,v25.4s,#12
- add w14,w14,w19
- sli v9.4s,v26.4s,#12
- add w15,w15,w20
- sli v13.4s,v27.4s,#12
- add w16,w16,w21
- sli v17.4s,v28.4s,#12
- eor w9,w9,w13
- sli v21.4s,v29.4s,#12
- eor w10,w10,w14
- add v0.4s,v0.4s,v1.4s
- eor w11,w11,w15
- add v4.4s,v4.4s,v5.4s
- eor w12,w12,w16
- add v8.4s,v8.4s,v9.4s
- ror w9,w9,#25
- add v12.4s,v12.4s,v13.4s
- ror w10,w10,#25
- add v16.4s,v16.4s,v17.4s
- ror w11,w11,#25
- add v20.4s,v20.4s,v21.4s
- ror w12,w12,#25
- eor v24.16b,v3.16b,v0.16b
- add w5,w5,w10
- eor v25.16b,v7.16b,v4.16b
- add w6,w6,w11
- eor v26.16b,v11.16b,v8.16b
- add w7,w7,w12
- eor v27.16b,v15.16b,v12.16b
- add w8,w8,w9
- eor v28.16b,v19.16b,v16.16b
- eor w21,w21,w5
- eor v29.16b,v23.16b,v20.16b
- eor w17,w17,w6
- ushr v3.4s,v24.4s,#24
- eor w19,w19,w7
- ushr v7.4s,v25.4s,#24
- eor w20,w20,w8
- ushr v11.4s,v26.4s,#24
- ror w21,w21,#16
- ushr v15.4s,v27.4s,#24
- ror w17,w17,#16
- ushr v19.4s,v28.4s,#24
- ror w19,w19,#16
- ushr v23.4s,v29.4s,#24
- ror w20,w20,#16
- sli v3.4s,v24.4s,#8
- add w15,w15,w21
- sli v7.4s,v25.4s,#8
- add w16,w16,w17
- sli v11.4s,v26.4s,#8
- add w13,w13,w19
- sli v15.4s,v27.4s,#8
- add w14,w14,w20
- sli v19.4s,v28.4s,#8
- eor w10,w10,w15
- sli v23.4s,v29.4s,#8
- eor w11,w11,w16
- add v2.4s,v2.4s,v3.4s
- eor w12,w12,w13
- add v6.4s,v6.4s,v7.4s
- eor w9,w9,w14
- add v10.4s,v10.4s,v11.4s
- ror w10,w10,#20
- add v14.4s,v14.4s,v15.4s
- ror w11,w11,#20
- add v18.4s,v18.4s,v19.4s
- ror w12,w12,#20
- add v22.4s,v22.4s,v23.4s
- ror w9,w9,#20
- eor v24.16b,v1.16b,v2.16b
- add w5,w5,w10
- eor v25.16b,v5.16b,v6.16b
- add w6,w6,w11
- eor v26.16b,v9.16b,v10.16b
- add w7,w7,w12
- eor v27.16b,v13.16b,v14.16b
- add w8,w8,w9
- eor v28.16b,v17.16b,v18.16b
- eor w21,w21,w5
- eor v29.16b,v21.16b,v22.16b
- eor w17,w17,w6
- ushr v1.4s,v24.4s,#25
- eor w19,w19,w7
- ushr v5.4s,v25.4s,#25
- eor w20,w20,w8
- ushr v9.4s,v26.4s,#25
- ror w21,w21,#24
- ushr v13.4s,v27.4s,#25
- ror w17,w17,#24
- ushr v17.4s,v28.4s,#25
- ror w19,w19,#24
- ushr v21.4s,v29.4s,#25
- ror w20,w20,#24
- sli v1.4s,v24.4s,#7
- add w15,w15,w21
- sli v5.4s,v25.4s,#7
- add w16,w16,w17
- sli v9.4s,v26.4s,#7
- add w13,w13,w19
- sli v13.4s,v27.4s,#7
- add w14,w14,w20
- sli v17.4s,v28.4s,#7
- eor w10,w10,w15
- sli v21.4s,v29.4s,#7
- eor w11,w11,w16
- ext v2.16b,v2.16b,v2.16b,#8
- eor w12,w12,w13
- ext v6.16b,v6.16b,v6.16b,#8
- eor w9,w9,w14
- ext v10.16b,v10.16b,v10.16b,#8
- ror w10,w10,#25
- ext v14.16b,v14.16b,v14.16b,#8
- ror w11,w11,#25
- ext v18.16b,v18.16b,v18.16b,#8
- ror w12,w12,#25
- ext v22.16b,v22.16b,v22.16b,#8
- ror w9,w9,#25
- ext v3.16b,v3.16b,v3.16b,#4
- ext v7.16b,v7.16b,v7.16b,#4
- ext v11.16b,v11.16b,v11.16b,#4
- ext v15.16b,v15.16b,v15.16b,#4
- ext v19.16b,v19.16b,v19.16b,#4
- ext v23.16b,v23.16b,v23.16b,#4
- ext v1.16b,v1.16b,v1.16b,#12
- ext v5.16b,v5.16b,v5.16b,#12
- ext v9.16b,v9.16b,v9.16b,#12
- ext v13.16b,v13.16b,v13.16b,#12
- ext v17.16b,v17.16b,v17.16b,#12
- ext v21.16b,v21.16b,v21.16b,#12
- cbnz x4,Loop_lower_neon
-
- add w5,w5,w22 // accumulate key block
- ldp q24,q25,[sp,#0]
- add x6,x6,x22,lsr#32
- ldp q26,q27,[sp,#32]
- add w7,w7,w23
- ldp q28,q29,[sp,#64]
- add x8,x8,x23,lsr#32
- add v0.4s,v0.4s,v24.4s
- add w9,w9,w24
- add v4.4s,v4.4s,v24.4s
- add x10,x10,x24,lsr#32
- add v8.4s,v8.4s,v24.4s
- add w11,w11,w25
- add v12.4s,v12.4s,v24.4s
- add x12,x12,x25,lsr#32
- add v16.4s,v16.4s,v24.4s
- add w13,w13,w26
- add v20.4s,v20.4s,v24.4s
- add x14,x14,x26,lsr#32
- add v2.4s,v2.4s,v26.4s
- add w15,w15,w27
- add v6.4s,v6.4s,v26.4s
- add x16,x16,x27,lsr#32
- add v10.4s,v10.4s,v26.4s
- add w17,w17,w28
- add v14.4s,v14.4s,v26.4s
- add x19,x19,x28,lsr#32
- add v18.4s,v18.4s,v26.4s
- add w20,w20,w30
- add v22.4s,v22.4s,v26.4s
- add x21,x21,x30,lsr#32
- add v19.4s,v19.4s,v31.4s // +4
- add x5,x5,x6,lsl#32 // pack
- add v23.4s,v23.4s,v31.4s // +4
- add x7,x7,x8,lsl#32
- add v3.4s,v3.4s,v27.4s
- ldp x6,x8,[x1,#0] // load input
- add v7.4s,v7.4s,v28.4s
- add x9,x9,x10,lsl#32
- add v11.4s,v11.4s,v29.4s
- add x11,x11,x12,lsl#32
- add v15.4s,v15.4s,v30.4s
- ldp x10,x12,[x1,#16]
- add v19.4s,v19.4s,v27.4s
- add x13,x13,x14,lsl#32
- add v23.4s,v23.4s,v28.4s
- add x15,x15,x16,lsl#32
- add v1.4s,v1.4s,v25.4s
- ldp x14,x16,[x1,#32]
- add v5.4s,v5.4s,v25.4s
- add x17,x17,x19,lsl#32
- add v9.4s,v9.4s,v25.4s
- add x20,x20,x21,lsl#32
- add v13.4s,v13.4s,v25.4s
- ldp x19,x21,[x1,#48]
- add v17.4s,v17.4s,v25.4s
- add x1,x1,#64
- add v21.4s,v21.4s,v25.4s
-
-#ifdef __AARCH64EB__
- rev x5,x5
- rev x7,x7
- rev x9,x9
- rev x11,x11
- rev x13,x13
- rev x15,x15
- rev x17,x17
- rev x20,x20
-#endif
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
- eor x5,x5,x6
- eor x7,x7,x8
- eor x9,x9,x10
- eor x11,x11,x12
- eor x13,x13,x14
- eor v0.16b,v0.16b,v24.16b
- eor x15,x15,x16
- eor v1.16b,v1.16b,v25.16b
- eor x17,x17,x19
- eor v2.16b,v2.16b,v26.16b
- eor x20,x20,x21
- eor v3.16b,v3.16b,v27.16b
- ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-
- stp x5,x7,[x0,#0] // store output
- add x28,x28,#7 // increment counter
- stp x9,x11,[x0,#16]
- stp x13,x15,[x0,#32]
- stp x17,x20,[x0,#48]
- add x0,x0,#64
- st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-
- ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
- eor v4.16b,v4.16b,v24.16b
- eor v5.16b,v5.16b,v25.16b
- eor v6.16b,v6.16b,v26.16b
- eor v7.16b,v7.16b,v27.16b
- st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
- eor v8.16b,v8.16b,v0.16b
- ldp q24,q25,[sp,#0]
- eor v9.16b,v9.16b,v1.16b
- ldp q26,q27,[sp,#32]
- eor v10.16b,v10.16b,v2.16b
- eor v11.16b,v11.16b,v3.16b
- st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
-
- ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
- eor v12.16b,v12.16b,v4.16b
- eor v13.16b,v13.16b,v5.16b
- eor v14.16b,v14.16b,v6.16b
- eor v15.16b,v15.16b,v7.16b
- st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
-
- ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
- eor v16.16b,v16.16b,v8.16b
- eor v17.16b,v17.16b,v9.16b
- eor v18.16b,v18.16b,v10.16b
- eor v19.16b,v19.16b,v11.16b
- st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
- shl v0.4s,v31.4s,#1 // 4 -> 8
- eor v20.16b,v20.16b,v12.16b
- eor v21.16b,v21.16b,v13.16b
- eor v22.16b,v22.16b,v14.16b
- eor v23.16b,v23.16b,v15.16b
- st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
-
- add v27.4s,v27.4s,v0.4s // += 8
- add v28.4s,v28.4s,v0.4s
- add v29.4s,v29.4s,v0.4s
- add v30.4s,v30.4s,v0.4s
-
- b.hs Loop_outer_512_neon
-
- adds x2,x2,#512
- ushr v0.4s,v31.4s,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp q24,q31,[sp,#0] // wipe off-load area
- stp q24,q31,[sp,#32]
- stp q24,q31,[sp,#64]
-
- b.eq Ldone_512_neon
-
- cmp x2,#192
- sub v27.4s,v27.4s,v0.4s // -= 1
- sub v28.4s,v28.4s,v0.4s
- sub v29.4s,v29.4s,v0.4s
- add sp,sp,#128
- b.hs Loop_outer_neon
-
- eor v25.16b,v25.16b,v25.16b
- eor v26.16b,v26.16b,v26.16b
- eor v27.16b,v27.16b,v27.16b
- eor v28.16b,v28.16b,v28.16b
- eor v29.16b,v29.16b,v29.16b
- eor v30.16b,v30.16b,v30.16b
- b Loop_outer
-
-Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S b/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S
deleted file mode 100644
index 04a1e22..0000000
--- a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S
+++ /dev/null
@@ -1,3009 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-.section __TEXT,__const
-
-.align 7
-Lchacha20_consts:
-.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-Linc:
-.long 1,2,3,4
-Lrol8:
-.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-Lclamp:
-.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-
-.text
-
-
-.align 6
-Lpoly_hash_ad_internal:
-.cfi_startproc
- cbnz x4, Lpoly_hash_intro
- ret
-
-Lpoly_hash_intro:
- cmp x4, #16
- b.lt Lpoly_hash_ad_tail
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- sub x4, x4, #16
- b Lpoly_hash_ad_internal
-
-Lpoly_hash_ad_tail:
- cbz x4, Lpoly_hash_ad_ret
-
- eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
- sub x4, x4, #1
-
-Lpoly_hash_tail_16_compose:
- ext v20.16b, v20.16b, v20.16b, #15
- ldrb w11, [x3, x4]
- mov v20.b[0], w11
- subs x4, x4, #1
- b.ge Lpoly_hash_tail_16_compose
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-
-Lpoly_hash_ad_ret:
- ret
-.cfi_endproc
-
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
-//
-.globl _chacha20_poly1305_seal
-.private_extern _chacha20_poly1305_seal
-
-.align 6
-_chacha20_poly1305_seal:
- AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
- stp x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset 80
-.cfi_offset w30, -72
-.cfi_offset w29, -80
- mov x29, sp
- // We probably could do .cfi_def_cfa w29, 80 at this point, but since
- // we don't actually use the frame pointer like that, it's probably not
- // worth bothering.
- stp d8, d9, [sp, #16]
- stp d10, d11, [sp, #32]
- stp d12, d13, [sp, #48]
- stp d14, d15, [sp, #64]
-.cfi_offset b15, -8
-.cfi_offset b14, -16
-.cfi_offset b13, -24
-.cfi_offset b12, -32
-.cfi_offset b11, -40
-.cfi_offset b10, -48
-.cfi_offset b9, -56
-.cfi_offset b8, -64
-
- adrp x11, Lchacha20_consts@PAGE
- add x11, x11, Lchacha20_consts@PAGEOFF
-
- ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
- ld1 {v28.16b - v30.16b}, [x5]
-
- mov x15, #1 // Prepare the Poly1305 state
- mov x8, #0
- mov x9, #0
- mov x10, #0
-
- ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
- add x12, x12, x2
- mov v31.d[0], x4 // Store the input and aad lengths
- mov v31.d[1], x12
-
- cmp x2, #128
- b.le Lseal_128 // Optimization for smaller buffers
-
- // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
- // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
- // the fifth block (A4-D4) horizontally.
- ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
- mov v4.16b, v24.16b
-
- ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
- mov v9.16b, v28.16b
-
- ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
- mov v14.16b, v29.16b
-
- ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
- add v15.4s, v15.4s, v25.4s
- mov v19.16b, v30.16b
-
- sub x5, x5, #32
-
- mov x6, #10
-
-.align 5
-Lseal_init_rounds:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v18.8h, v18.8h
- rev32 v19.8h, v19.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- eor v8.16b, v8.16b, v13.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v9.4s, #20
- sli v8.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- add v3.4s, v3.4s, v7.4s
- add v4.4s, v4.4s, v8.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v14.16b
-
- ushr v9.4s, v8.4s, #25
- sli v9.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #4
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #12
- add v0.4s, v0.4s, v6.4s
- add v1.4s, v1.4s, v7.4s
- add v2.4s, v2.4s, v8.4s
- add v3.4s, v3.4s, v5.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v18.8h, v18.8h
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v19.8h, v19.8h
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v6.4s, #20
- sli v20.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v5.4s, #20
- sli v8.4s, v5.4s, #12
- ushr v5.4s, v9.4s, #20
- sli v5.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v5.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- eor v7.16b, v7.16b, v10.16b
- eor v8.16b, v8.16b, v11.16b
- eor v5.16b, v5.16b, v14.16b
-
- ushr v9.4s, v5.4s, #25
- sli v9.4s, v5.4s, #7
- ushr v5.4s, v8.4s, #25
- sli v5.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v20.4s, #25
- sli v6.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #12
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #4
- subs x6, x6, #1
- b.hi Lseal_init_rounds
-
- add v15.4s, v15.4s, v25.4s
- mov x11, #4
- dup v20.4s, w11
- add v25.4s, v25.4s, v20.4s
-
- zip1 v20.4s, v0.4s, v1.4s
- zip2 v21.4s, v0.4s, v1.4s
- zip1 v22.4s, v2.4s, v3.4s
- zip2 v23.4s, v2.4s, v3.4s
-
- zip1 v0.2d, v20.2d, v22.2d
- zip2 v1.2d, v20.2d, v22.2d
- zip1 v2.2d, v21.2d, v23.2d
- zip2 v3.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v5.4s, v6.4s
- zip2 v21.4s, v5.4s, v6.4s
- zip1 v22.4s, v7.4s, v8.4s
- zip2 v23.4s, v7.4s, v8.4s
-
- zip1 v5.2d, v20.2d, v22.2d
- zip2 v6.2d, v20.2d, v22.2d
- zip1 v7.2d, v21.2d, v23.2d
- zip2 v8.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v10.4s, v11.4s
- zip2 v21.4s, v10.4s, v11.4s
- zip1 v22.4s, v12.4s, v13.4s
- zip2 v23.4s, v12.4s, v13.4s
-
- zip1 v10.2d, v20.2d, v22.2d
- zip2 v11.2d, v20.2d, v22.2d
- zip1 v12.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v15.4s, v16.4s
- zip2 v21.4s, v15.4s, v16.4s
- zip1 v22.4s, v17.4s, v18.4s
- zip2 v23.4s, v17.4s, v18.4s
-
- zip1 v15.2d, v20.2d, v22.2d
- zip2 v16.2d, v20.2d, v22.2d
- zip1 v17.2d, v21.2d, v23.2d
- zip2 v18.2d, v21.2d, v23.2d
-
- add v4.4s, v4.4s, v24.4s
- add v9.4s, v9.4s, v28.4s
- and v4.16b, v4.16b, v27.16b
-
- add v0.4s, v0.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
-
- add v1.4s, v1.4s, v24.4s
- add v6.4s, v6.4s, v28.4s
- add v11.4s, v11.4s, v29.4s
- add v16.4s, v16.4s, v30.4s
-
- add v2.4s, v2.4s, v24.4s
- add v7.4s, v7.4s, v28.4s
- add v12.4s, v12.4s, v29.4s
- add v17.4s, v17.4s, v30.4s
-
- add v3.4s, v3.4s, v24.4s
- add v8.4s, v8.4s, v28.4s
- add v13.4s, v13.4s, v29.4s
- add v18.4s, v18.4s, v30.4s
-
- mov x16, v4.d[0] // Move the R key to GPRs
- mov x17, v4.d[1]
- mov v27.16b, v9.16b // Store the S key
-
- bl Lpoly_hash_ad_internal
-
- mov x3, x0
- cmp x2, #256
- b.le Lseal_tail
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v10.16b
- eor v23.16b, v23.16b, v15.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v1.16b
- eor v21.16b, v21.16b, v6.16b
- eor v22.16b, v22.16b, v11.16b
- eor v23.16b, v23.16b, v16.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v2.16b
- eor v21.16b, v21.16b, v7.16b
- eor v22.16b, v22.16b, v12.16b
- eor v23.16b, v23.16b, v17.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v3.16b
- eor v21.16b, v21.16b, v8.16b
- eor v22.16b, v22.16b, v13.16b
- eor v23.16b, v23.16b, v18.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #256
-
- mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
- mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
-
-Lseal_main_loop:
- adrp x11, Lchacha20_consts@PAGE
- add x11, x11, Lchacha20_consts@PAGEOFF
-
- ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
- mov v4.16b, v24.16b
-
- ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
- mov v9.16b, v28.16b
-
- ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
- mov v14.16b, v29.16b
-
- ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
- add v15.4s, v15.4s, v25.4s
- mov v19.16b, v30.16b
-
- eor v20.16b, v20.16b, v20.16b //zero
- not v21.16b, v20.16b // -1
- sub v21.4s, v25.4s, v21.4s // Add +1
- ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
- add v19.4s, v19.4s, v20.4s
-
- sub x5, x5, #32
-.align 5
-Lseal_main_loop_rounds:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v18.8h, v18.8h
- rev32 v19.8h, v19.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- eor v8.16b, v8.16b, v13.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v9.4s, #20
- sli v8.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- add v3.4s, v3.4s, v7.4s
- add v4.4s, v4.4s, v8.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v14.16b
-
- ushr v9.4s, v8.4s, #25
- sli v9.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #4
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #12
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- add v0.4s, v0.4s, v6.4s
- add v1.4s, v1.4s, v7.4s
- add v2.4s, v2.4s, v8.4s
- add v3.4s, v3.4s, v5.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v18.8h, v18.8h
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v19.8h, v19.8h
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v6.4s, #20
- sli v20.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v5.4s, #20
- sli v8.4s, v5.4s, #12
- ushr v5.4s, v9.4s, #20
- sli v5.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v5.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- eor v7.16b, v7.16b, v10.16b
- eor v8.16b, v8.16b, v11.16b
- eor v5.16b, v5.16b, v14.16b
-
- ushr v9.4s, v5.4s, #25
- sli v9.4s, v5.4s, #7
- ushr v5.4s, v8.4s, #25
- sli v5.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v20.4s, #25
- sli v6.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #12
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #4
- subs x6, x6, #1
- b.ge Lseal_main_loop_rounds
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- subs x7, x7, #1
- b.gt Lseal_main_loop_rounds
-
- eor v20.16b, v20.16b, v20.16b //zero
- not v21.16b, v20.16b // -1
- sub v21.4s, v25.4s, v21.4s // Add +1
- ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
- add v19.4s, v19.4s, v20.4s
-
- add v15.4s, v15.4s, v25.4s
- mov x11, #5
- dup v20.4s, w11
- add v25.4s, v25.4s, v20.4s
-
- zip1 v20.4s, v0.4s, v1.4s
- zip2 v21.4s, v0.4s, v1.4s
- zip1 v22.4s, v2.4s, v3.4s
- zip2 v23.4s, v2.4s, v3.4s
-
- zip1 v0.2d, v20.2d, v22.2d
- zip2 v1.2d, v20.2d, v22.2d
- zip1 v2.2d, v21.2d, v23.2d
- zip2 v3.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v5.4s, v6.4s
- zip2 v21.4s, v5.4s, v6.4s
- zip1 v22.4s, v7.4s, v8.4s
- zip2 v23.4s, v7.4s, v8.4s
-
- zip1 v5.2d, v20.2d, v22.2d
- zip2 v6.2d, v20.2d, v22.2d
- zip1 v7.2d, v21.2d, v23.2d
- zip2 v8.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v10.4s, v11.4s
- zip2 v21.4s, v10.4s, v11.4s
- zip1 v22.4s, v12.4s, v13.4s
- zip2 v23.4s, v12.4s, v13.4s
-
- zip1 v10.2d, v20.2d, v22.2d
- zip2 v11.2d, v20.2d, v22.2d
- zip1 v12.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v15.4s, v16.4s
- zip2 v21.4s, v15.4s, v16.4s
- zip1 v22.4s, v17.4s, v18.4s
- zip2 v23.4s, v17.4s, v18.4s
-
- zip1 v15.2d, v20.2d, v22.2d
- zip2 v16.2d, v20.2d, v22.2d
- zip1 v17.2d, v21.2d, v23.2d
- zip2 v18.2d, v21.2d, v23.2d
-
- add v0.4s, v0.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
-
- add v1.4s, v1.4s, v24.4s
- add v6.4s, v6.4s, v28.4s
- add v11.4s, v11.4s, v29.4s
- add v16.4s, v16.4s, v30.4s
-
- add v2.4s, v2.4s, v24.4s
- add v7.4s, v7.4s, v28.4s
- add v12.4s, v12.4s, v29.4s
- add v17.4s, v17.4s, v30.4s
-
- add v3.4s, v3.4s, v24.4s
- add v8.4s, v8.4s, v28.4s
- add v13.4s, v13.4s, v29.4s
- add v18.4s, v18.4s, v30.4s
-
- add v4.4s, v4.4s, v24.4s
- add v9.4s, v9.4s, v28.4s
- add v14.4s, v14.4s, v29.4s
- add v19.4s, v19.4s, v30.4s
-
- cmp x2, #320
- b.le Lseal_tail
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v10.16b
- eor v23.16b, v23.16b, v15.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v1.16b
- eor v21.16b, v21.16b, v6.16b
- eor v22.16b, v22.16b, v11.16b
- eor v23.16b, v23.16b, v16.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v2.16b
- eor v21.16b, v21.16b, v7.16b
- eor v22.16b, v22.16b, v12.16b
- eor v23.16b, v23.16b, v17.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v3.16b
- eor v21.16b, v21.16b, v8.16b
- eor v22.16b, v22.16b, v13.16b
- eor v23.16b, v23.16b, v18.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v9.16b
- eor v22.16b, v22.16b, v14.16b
- eor v23.16b, v23.16b, v19.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #320
-
- mov x6, #0
- mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
-
- b Lseal_main_loop
-
-Lseal_tail:
- // This part of the function handles the storage and authentication of the last [0,320) bytes
- // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
- cmp x2, #64
- b.lt Lseal_tail_64
-
- // Store and authenticate 64B blocks per iteration
- ld1 {v20.16b - v23.16b}, [x1], #64
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v10.16b
- eor v23.16b, v23.16b, v15.16b
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v21.d[0]
- mov x12, v21.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v22.d[0]
- mov x12, v22.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v23.d[0]
- mov x12, v23.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- st1 {v20.16b - v23.16b}, [x0], #64
- sub x2, x2, #64
-
- // Shift the state left by 64 bytes for the next iteration of the loop
- mov v0.16b, v1.16b
- mov v5.16b, v6.16b
- mov v10.16b, v11.16b
- mov v15.16b, v16.16b
-
- mov v1.16b, v2.16b
- mov v6.16b, v7.16b
- mov v11.16b, v12.16b
- mov v16.16b, v17.16b
-
- mov v2.16b, v3.16b
- mov v7.16b, v8.16b
- mov v12.16b, v13.16b
- mov v17.16b, v18.16b
-
- mov v3.16b, v4.16b
- mov v8.16b, v9.16b
- mov v13.16b, v14.16b
- mov v18.16b, v19.16b
-
- b Lseal_tail
-
-Lseal_tail_64:
- ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
-
- // Here we handle the last [0,64) bytes of plaintext
- cmp x2, #16
- b.lt Lseal_tail_16
- // Each iteration encrypt and authenticate a 16B block
- ld1 {v20.16b}, [x1], #16
- eor v20.16b, v20.16b, v0.16b
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- st1 {v20.16b}, [x0], #16
-
- sub x2, x2, #16
-
- // Shift the state left by 16 bytes for the next iteration of the loop
- mov v0.16b, v5.16b
- mov v5.16b, v10.16b
- mov v10.16b, v15.16b
-
- b Lseal_tail_64
-
-Lseal_tail_16:
- // Here we handle the last [0,16) bytes of ciphertext that require a padded block
- cbz x2, Lseal_hash_extra
-
- eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
- eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
- not v22.16b, v20.16b
-
- mov x6, x2
- add x1, x1, x2
-
- cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
-
- mov x7, #16 // We need to load some extra_in first for padding
- sub x7, x7, x2
- cmp x4, x7
- csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
- mov x12, x7
- add x3, x3, x7
- sub x4, x4, x7
-
-Lseal_tail16_compose_extra_in:
- ext v20.16b, v20.16b, v20.16b, #15
- ldrb w11, [x3, #-1]!
- mov v20.b[0], w11
- subs x7, x7, #1
- b.gt Lseal_tail16_compose_extra_in
-
- add x3, x3, x12
-
-Lseal_tail_16_compose:
- ext v20.16b, v20.16b, v20.16b, #15
- ldrb w11, [x1, #-1]!
- mov v20.b[0], w11
- ext v21.16b, v22.16b, v21.16b, #15
- subs x2, x2, #1
- b.gt Lseal_tail_16_compose
-
- and v0.16b, v0.16b, v21.16b
- eor v20.16b, v20.16b, v0.16b
- mov v21.16b, v20.16b
-
-Lseal_tail_16_store:
- umov w11, v20.b[0]
- strb w11, [x0], #1
- ext v20.16b, v20.16b, v20.16b, #1
- subs x6, x6, #1
- b.gt Lseal_tail_16_store
-
- // Hash in the final ct block concatenated with extra_in
- mov x11, v21.d[0]
- mov x12, v21.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-
-Lseal_hash_extra:
- cbz x4, Lseal_finalize
-
-Lseal_hash_extra_loop:
- cmp x4, #16
- b.lt Lseal_hash_extra_tail
- ld1 {v20.16b}, [x3], #16
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- sub x4, x4, #16
- b Lseal_hash_extra_loop
-
-Lseal_hash_extra_tail:
- cbz x4, Lseal_finalize
- eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
- add x3, x3, x4
-
-Lseal_hash_extra_load:
- ext v20.16b, v20.16b, v20.16b, #15
- ldrb w11, [x3, #-1]!
- mov v20.b[0], w11
- subs x4, x4, #1
- b.gt Lseal_hash_extra_load
-
-	// Hash in the final padded extra_in block
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-
-Lseal_finalize:
- mov x11, v31.d[0]
- mov x12, v31.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- // Final reduction step
- sub x12, xzr, x15
- orr x13, xzr, #3
- subs x11, x8, #-5
- sbcs x12, x9, x12
- sbcs x13, x10, x13
- csel x8, x11, x8, cs
- csel x9, x12, x9, cs
- csel x10, x13, x10, cs
- mov x11, v27.d[0]
- mov x12, v27.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
-
- stp x8, x9, [x5]
-
- ldp d8, d9, [sp, #16]
- ldp d10, d11, [sp, #32]
- ldp d12, d13, [sp, #48]
- ldp d14, d15, [sp, #64]
-.cfi_restore b15
-.cfi_restore b14
-.cfi_restore b13
-.cfi_restore b12
-.cfi_restore b11
-.cfi_restore b10
-.cfi_restore b9
-.cfi_restore b8
- ldp x29, x30, [sp], 80
-.cfi_restore w29
-.cfi_restore w30
-.cfi_def_cfa_offset 0
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
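(Reference: the "Final reduction step" above conditionally subtracts p = 2^130 - 5 from the three-limb accumulator, then adds the 128-bit S key held in v27 and stores only the low 128 bits as the tag. The C sketch below mirrors that last step with the same 64-bit limb layout; it is a generic illustration with hypothetical names, not code from this file.)

    #include <stdint.h>

    /* Final Poly1305 step: one conditional subtraction of p = 2^130 - 5
       (limbs 0xFFFFFFFFFFFFFFFB, 0xFFFFFFFFFFFFFFFF, 3), then tag = acc + s
       mod 2^128. The assembly does the same with subs/sbcs/csel and adds/adcs. */
    static void poly1305_tag(uint64_t acc0, uint64_t acc1, uint64_t acc2,
                             uint64_t s0, uint64_t s1, uint64_t tag[2]) {
      uint64_t d0 = acc0 - 0xFFFFFFFFFFFFFFFBull;
      uint64_t b0 = acc0 < 0xFFFFFFFFFFFFFFFBull;            /* borrow, limb 0 */
      uint64_t d1 = acc1 - 0xFFFFFFFFFFFFFFFFull - b0;
      uint64_t b1 = (acc1 < 0xFFFFFFFFFFFFFFFFull) ||
                    (b0 && acc1 == 0xFFFFFFFFFFFFFFFFull);    /* borrow, limb 1 */
      uint64_t b2 = acc2 < 3 + b1;                            /* borrow, limb 2 */

      if (!b2) {     /* no borrow: acc >= p, so keep acc - p (the csel ..., cs) */
        acc0 = d0;
        acc1 = d1;
      }

      tag[0] = acc0 + s0;                                     /* low 64 bits   */
      tag[1] = acc1 + s1 + (tag[0] < s0);                     /* plus carry    */
    }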
-Lseal_128:
- // On some architectures preparing 5 blocks for small buffers is wasteful
- eor v25.16b, v25.16b, v25.16b
- mov x11, #1
- mov v25.s[0], w11
- mov v0.16b, v24.16b
- mov v1.16b, v24.16b
- mov v2.16b, v24.16b
- mov v5.16b, v28.16b
- mov v6.16b, v28.16b
- mov v7.16b, v28.16b
- mov v10.16b, v29.16b
- mov v11.16b, v29.16b
- mov v12.16b, v29.16b
- mov v17.16b, v30.16b
- add v15.4s, v17.4s, v25.4s
- add v16.4s, v15.4s, v25.4s
-
- mov x6, #10
-
-Lseal_128_rounds:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #4
- ext v7.16b, v7.16b, v7.16b, #4
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #12
- ext v16.16b, v16.16b, v16.16b, #12
- ext v17.16b, v17.16b, v17.16b, #12
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #12
- ext v6.16b, v6.16b, v6.16b, #12
- ext v7.16b, v7.16b, v7.16b, #12
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #4
- ext v16.16b, v16.16b, v16.16b, #4
- ext v17.16b, v17.16b, v17.16b, #4
- subs x6, x6, #1
- b.hi Lseal_128_rounds
-
- add v0.4s, v0.4s, v24.4s
- add v1.4s, v1.4s, v24.4s
- add v2.4s, v2.4s, v24.4s
-
- add v5.4s, v5.4s, v28.4s
- add v6.4s, v6.4s, v28.4s
- add v7.4s, v7.4s, v28.4s
-
- // Only the first 32 bytes of the third block (counter = 0) are needed,
- // so skip updating v12 and v17.
- add v10.4s, v10.4s, v29.4s
- add v11.4s, v11.4s, v29.4s
-
- add v30.4s, v30.4s, v25.4s
- add v15.4s, v15.4s, v30.4s
- add v30.4s, v30.4s, v25.4s
- add v16.4s, v16.4s, v30.4s
-
- and v2.16b, v2.16b, v27.16b
- mov x16, v2.d[0] // Move the R key to GPRs
- mov x17, v2.d[1]
- mov v27.16b, v7.16b // Store the S key
-
- bl Lpoly_hash_ad_internal
- b Lseal_tail
-.cfi_endproc
-
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
-//
-.globl _chacha20_poly1305_open
-.private_extern _chacha20_poly1305_open
-
-.align 6
-_chacha20_poly1305_open:
- AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
- stp x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset 80
-.cfi_offset w30, -72
-.cfi_offset w29, -80
- mov x29, sp
- // We probably could do .cfi_def_cfa w29, 80 at this point, but since
- // we don't actually use the frame pointer like that, it's probably not
- // worth bothering.
- stp d8, d9, [sp, #16]
- stp d10, d11, [sp, #32]
- stp d12, d13, [sp, #48]
- stp d14, d15, [sp, #64]
-.cfi_offset b15, -8
-.cfi_offset b14, -16
-.cfi_offset b13, -24
-.cfi_offset b12, -32
-.cfi_offset b11, -40
-.cfi_offset b10, -48
-.cfi_offset b9, -56
-.cfi_offset b8, -64
-
- adrp x11, Lchacha20_consts@PAGE
- add x11, x11, Lchacha20_consts@PAGEOFF
-
- ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
- ld1 {v28.16b - v30.16b}, [x5]
-
- mov x15, #1 // Prepare the Poly1305 state
- mov x8, #0
- mov x9, #0
- mov x10, #0
-
- mov v31.d[0], x4 // Store the input and aad lengths
- mov v31.d[1], x2
-
- cmp x2, #128
- b.le Lopen_128 // Optimization for smaller buffers
-
- // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
- mov v0.16b, v24.16b
- mov v5.16b, v28.16b
- mov v10.16b, v29.16b
- mov v15.16b, v30.16b
-
- mov x6, #10
-
-.align 5
-Lopen_init_rounds:
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #4
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #12
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #12
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #4
- subs x6, x6, #1
- b.hi Lopen_init_rounds
-
- add v0.4s, v0.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
-
- and v0.16b, v0.16b, v27.16b
- mov x16, v0.d[0] // Move the R key to GPRs
- mov x17, v0.d[1]
- mov v27.16b, v5.16b // Store the S key
-
- bl Lpoly_hash_ad_internal
-
-Lopen_ad_done:
- mov x3, x1
-
-// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
-Lopen_main_loop:
-
- cmp x2, #192
- b.lt Lopen_tail
-
- adrp x11, Lchacha20_consts@PAGE
- add x11, x11, Lchacha20_consts@PAGEOFF
-
- ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
- mov v4.16b, v24.16b
-
- ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
- mov v9.16b, v28.16b
-
- ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
- mov v14.16b, v29.16b
-
- ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
- sub x5, x5, #32
- add v15.4s, v15.4s, v25.4s
- mov v19.16b, v30.16b
-
- eor v20.16b, v20.16b, v20.16b //zero
- not v21.16b, v20.16b // -1
- sub v21.4s, v25.4s, v21.4s // Add +1
- ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
- add v19.4s, v19.4s, v20.4s
-
- lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
- sub x4, x4, #10
-
- mov x7, #10
- subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
- csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
-
- cbz x7, Lopen_main_loop_rounds_short
-
-.align 5
-Lopen_main_loop_rounds:
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-Lopen_main_loop_rounds_short:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v18.8h, v18.8h
- rev32 v19.8h, v19.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- eor v8.16b, v8.16b, v13.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v9.4s, #20
- sli v8.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- add v3.4s, v3.4s, v7.4s
- add v4.4s, v4.4s, v8.4s
-
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- add v13.4s, v13.4s, v18.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v14.16b
-
- ushr v9.4s, v8.4s, #25
- sli v9.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #4
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #12
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- add v0.4s, v0.4s, v6.4s
- add v1.4s, v1.4s, v7.4s
- add v2.4s, v2.4s, v8.4s
- add v3.4s, v3.4s, v5.4s
- add v4.4s, v4.4s, v9.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- rev32 v18.8h, v18.8h
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
- rev32 v19.8h, v19.8h
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v6.16b, v6.16b, v12.16b
- eor v7.16b, v7.16b, v13.16b
- eor v8.16b, v8.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v9.16b, v9.16b, v14.16b
-
- ushr v20.4s, v6.4s, #20
- sli v20.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
- ushr v7.4s, v8.4s, #20
- sli v7.4s, v8.4s, #12
- ushr v8.4s, v5.4s, #20
- sli v8.4s, v5.4s, #12
- ushr v5.4s, v9.4s, #20
- sli v5.4s, v9.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- add v3.4s, v3.4s, v8.4s
- add v4.4s, v4.4s, v5.4s
-
- eor v18.16b, v18.16b, v0.16b
- eor v15.16b, v15.16b, v1.16b
- eor v16.16b, v16.16b, v2.16b
- eor v17.16b, v17.16b, v3.16b
- eor v19.16b, v19.16b, v4.16b
-
- tbl v18.16b, {v18.16b}, v26.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
- tbl v19.16b, {v19.16b}, v26.16b
-
- add v12.4s, v12.4s, v18.4s
- add v13.4s, v13.4s, v15.4s
- add v10.4s, v10.4s, v16.4s
- add v11.4s, v11.4s, v17.4s
- add v14.4s, v14.4s, v19.4s
-
- eor v20.16b, v20.16b, v12.16b
- eor v6.16b, v6.16b, v13.16b
- eor v7.16b, v7.16b, v10.16b
- eor v8.16b, v8.16b, v11.16b
- eor v5.16b, v5.16b, v14.16b
-
- ushr v9.4s, v5.4s, #25
- sli v9.4s, v5.4s, #7
- ushr v5.4s, v8.4s, #25
- sli v5.4s, v8.4s, #7
- ushr v8.4s, v7.4s, #25
- sli v8.4s, v7.4s, #7
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v20.4s, #25
- sli v6.4s, v20.4s, #7
-
- ext v9.16b, v9.16b, v9.16b, #12
- ext v14.16b, v14.16b, v14.16b, #8
- ext v19.16b, v19.16b, v19.16b, #4
- subs x7, x7, #1
- b.gt Lopen_main_loop_rounds
- subs x6, x6, #1
- b.ge Lopen_main_loop_rounds_short
-
- eor v20.16b, v20.16b, v20.16b //zero
- not v21.16b, v20.16b // -1
- sub v21.4s, v25.4s, v21.4s // Add +1
- ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
- add v19.4s, v19.4s, v20.4s
-
- add v15.4s, v15.4s, v25.4s
- mov x11, #5
- dup v20.4s, w11
- add v25.4s, v25.4s, v20.4s
-
- zip1 v20.4s, v0.4s, v1.4s
- zip2 v21.4s, v0.4s, v1.4s
- zip1 v22.4s, v2.4s, v3.4s
- zip2 v23.4s, v2.4s, v3.4s
-
- zip1 v0.2d, v20.2d, v22.2d
- zip2 v1.2d, v20.2d, v22.2d
- zip1 v2.2d, v21.2d, v23.2d
- zip2 v3.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v5.4s, v6.4s
- zip2 v21.4s, v5.4s, v6.4s
- zip1 v22.4s, v7.4s, v8.4s
- zip2 v23.4s, v7.4s, v8.4s
-
- zip1 v5.2d, v20.2d, v22.2d
- zip2 v6.2d, v20.2d, v22.2d
- zip1 v7.2d, v21.2d, v23.2d
- zip2 v8.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v10.4s, v11.4s
- zip2 v21.4s, v10.4s, v11.4s
- zip1 v22.4s, v12.4s, v13.4s
- zip2 v23.4s, v12.4s, v13.4s
-
- zip1 v10.2d, v20.2d, v22.2d
- zip2 v11.2d, v20.2d, v22.2d
- zip1 v12.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
-
- zip1 v20.4s, v15.4s, v16.4s
- zip2 v21.4s, v15.4s, v16.4s
- zip1 v22.4s, v17.4s, v18.4s
- zip2 v23.4s, v17.4s, v18.4s
-
- zip1 v15.2d, v20.2d, v22.2d
- zip2 v16.2d, v20.2d, v22.2d
- zip1 v17.2d, v21.2d, v23.2d
- zip2 v18.2d, v21.2d, v23.2d
-
- add v0.4s, v0.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
-
- add v1.4s, v1.4s, v24.4s
- add v6.4s, v6.4s, v28.4s
- add v11.4s, v11.4s, v29.4s
- add v16.4s, v16.4s, v30.4s
-
- add v2.4s, v2.4s, v24.4s
- add v7.4s, v7.4s, v28.4s
- add v12.4s, v12.4s, v29.4s
- add v17.4s, v17.4s, v30.4s
-
- add v3.4s, v3.4s, v24.4s
- add v8.4s, v8.4s, v28.4s
- add v13.4s, v13.4s, v29.4s
- add v18.4s, v18.4s, v30.4s
-
- add v4.4s, v4.4s, v24.4s
- add v9.4s, v9.4s, v28.4s
- add v14.4s, v14.4s, v29.4s
- add v19.4s, v19.4s, v30.4s
-
- // We can always safely store 192 bytes
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v10.16b
- eor v23.16b, v23.16b, v15.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v1.16b
- eor v21.16b, v21.16b, v6.16b
- eor v22.16b, v22.16b, v11.16b
- eor v23.16b, v23.16b, v16.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v2.16b
- eor v21.16b, v21.16b, v7.16b
- eor v22.16b, v22.16b, v12.16b
- eor v23.16b, v23.16b, v17.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #192
-
- mov v0.16b, v3.16b
- mov v5.16b, v8.16b
- mov v10.16b, v13.16b
- mov v15.16b, v18.16b
-
- cmp x2, #64
- b.lt Lopen_tail_64_store
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v3.16b
- eor v21.16b, v21.16b, v8.16b
- eor v22.16b, v22.16b, v13.16b
- eor v23.16b, v23.16b, v18.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #64
-
- mov v0.16b, v4.16b
- mov v5.16b, v9.16b
- mov v10.16b, v14.16b
- mov v15.16b, v19.16b
-
- cmp x2, #64
- b.lt Lopen_tail_64_store
-
- ld1 {v20.16b - v23.16b}, [x1], #64
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v9.16b
- eor v22.16b, v22.16b, v14.16b
- eor v23.16b, v23.16b, v19.16b
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #64
- b Lopen_main_loop
-
-Lopen_tail:
-
- cbz x2, Lopen_finalize
-
- lsr x4, x2, #4 // How many whole blocks we have to hash
-
- cmp x2, #64
- b.le Lopen_tail_64
- cmp x2, #128
- b.le Lopen_tail_128
-
-Lopen_tail_192:
- // We need three more blocks
- mov v0.16b, v24.16b
- mov v1.16b, v24.16b
- mov v2.16b, v24.16b
- mov v5.16b, v28.16b
- mov v6.16b, v28.16b
- mov v7.16b, v28.16b
- mov v10.16b, v29.16b
- mov v11.16b, v29.16b
- mov v12.16b, v29.16b
- mov v15.16b, v30.16b
- mov v16.16b, v30.16b
- mov v17.16b, v30.16b
- eor v23.16b, v23.16b, v23.16b
- eor v21.16b, v21.16b, v21.16b
- ins v23.s[0], v25.s[0]
- ins v21.d[0], x15
-
- add v22.4s, v23.4s, v21.4s
- add v21.4s, v22.4s, v21.4s
-
- add v15.4s, v15.4s, v21.4s
- add v16.4s, v16.4s, v23.4s
- add v17.4s, v17.4s, v22.4s
-
- mov x7, #10
- subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
- csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
- sub x4, x4, x7
-
- cbz x7, Lopen_tail_192_rounds_no_hash
-
-Lopen_tail_192_rounds:
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-Lopen_tail_192_rounds_no_hash:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #4
- ext v7.16b, v7.16b, v7.16b, #4
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #12
- ext v16.16b, v16.16b, v16.16b, #12
- ext v17.16b, v17.16b, v17.16b, #12
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #12
- ext v6.16b, v6.16b, v6.16b, #12
- ext v7.16b, v7.16b, v7.16b, #12
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #4
- ext v16.16b, v16.16b, v16.16b, #4
- ext v17.16b, v17.16b, v17.16b, #4
- subs x7, x7, #1
- b.gt Lopen_tail_192_rounds
- subs x6, x6, #1
- b.ge Lopen_tail_192_rounds_no_hash
-
- // We hashed 160 bytes at most, may still have 32 bytes left
-Lopen_tail_192_hash:
- cbz x4, Lopen_tail_192_hash_done
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- sub x4, x4, #1
- b Lopen_tail_192_hash
-
-Lopen_tail_192_hash_done:
-
- add v0.4s, v0.4s, v24.4s
- add v1.4s, v1.4s, v24.4s
- add v2.4s, v2.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v6.4s, v6.4s, v28.4s
- add v7.4s, v7.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v11.4s, v11.4s, v29.4s
- add v12.4s, v12.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
- add v16.4s, v16.4s, v30.4s
- add v17.4s, v17.4s, v30.4s
-
- add v15.4s, v15.4s, v21.4s
- add v16.4s, v16.4s, v23.4s
- add v17.4s, v17.4s, v22.4s
-
- ld1 {v20.16b - v23.16b}, [x1], #64
-
- eor v20.16b, v20.16b, v1.16b
- eor v21.16b, v21.16b, v6.16b
- eor v22.16b, v22.16b, v11.16b
- eor v23.16b, v23.16b, v16.16b
-
- st1 {v20.16b - v23.16b}, [x0], #64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
-
- eor v20.16b, v20.16b, v2.16b
- eor v21.16b, v21.16b, v7.16b
- eor v22.16b, v22.16b, v12.16b
- eor v23.16b, v23.16b, v17.16b
-
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #128
- b Lopen_tail_64_store
-
-Lopen_tail_128:
- // We need two more blocks
- mov v0.16b, v24.16b
- mov v1.16b, v24.16b
- mov v5.16b, v28.16b
- mov v6.16b, v28.16b
- mov v10.16b, v29.16b
- mov v11.16b, v29.16b
- mov v15.16b, v30.16b
- mov v16.16b, v30.16b
- eor v23.16b, v23.16b, v23.16b
- eor v22.16b, v22.16b, v22.16b
- ins v23.s[0], v25.s[0]
- ins v22.d[0], x15
- add v22.4s, v22.4s, v23.4s
-
- add v15.4s, v15.4s, v22.4s
- add v16.4s, v16.4s, v23.4s
-
- mov x6, #10
- sub x6, x6, x4
-
-Lopen_tail_128_rounds:
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #4
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #12
- add v1.4s, v1.4s, v6.4s
- eor v16.16b, v16.16b, v1.16b
- rev32 v16.8h, v16.8h
-
- add v11.4s, v11.4s, v16.4s
- eor v6.16b, v6.16b, v11.16b
- ushr v20.4s, v6.4s, #20
- sli v20.4s, v6.4s, #12
- add v1.4s, v1.4s, v20.4s
- eor v16.16b, v16.16b, v1.16b
- tbl v16.16b, {v16.16b}, v26.16b
-
- add v11.4s, v11.4s, v16.4s
- eor v20.16b, v20.16b, v11.16b
- ushr v6.4s, v20.4s, #25
- sli v6.4s, v20.4s, #7
- ext v6.16b, v6.16b, v6.16b, #4
- ext v11.16b, v11.16b, v11.16b, #8
- ext v16.16b, v16.16b, v16.16b, #12
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #12
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #4
- add v1.4s, v1.4s, v6.4s
- eor v16.16b, v16.16b, v1.16b
- rev32 v16.8h, v16.8h
-
- add v11.4s, v11.4s, v16.4s
- eor v6.16b, v6.16b, v11.16b
- ushr v20.4s, v6.4s, #20
- sli v20.4s, v6.4s, #12
- add v1.4s, v1.4s, v20.4s
- eor v16.16b, v16.16b, v1.16b
- tbl v16.16b, {v16.16b}, v26.16b
-
- add v11.4s, v11.4s, v16.4s
- eor v20.16b, v20.16b, v11.16b
- ushr v6.4s, v20.4s, #25
- sli v6.4s, v20.4s, #7
- ext v6.16b, v6.16b, v6.16b, #12
- ext v11.16b, v11.16b, v11.16b, #8
- ext v16.16b, v16.16b, v16.16b, #4
- subs x6, x6, #1
- b.gt Lopen_tail_128_rounds
- cbz x4, Lopen_tail_128_rounds_done
- subs x4, x4, #1
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- b Lopen_tail_128_rounds
-
-Lopen_tail_128_rounds_done:
- add v0.4s, v0.4s, v24.4s
- add v1.4s, v1.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v6.4s, v6.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v11.4s, v11.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
- add v16.4s, v16.4s, v30.4s
- add v15.4s, v15.4s, v22.4s
- add v16.4s, v16.4s, v23.4s
-
- ld1 {v20.16b - v23.16b}, [x1], #64
-
- eor v20.16b, v20.16b, v1.16b
- eor v21.16b, v21.16b, v6.16b
- eor v22.16b, v22.16b, v11.16b
- eor v23.16b, v23.16b, v16.16b
-
- st1 {v20.16b - v23.16b}, [x0], #64
- sub x2, x2, #64
-
- b Lopen_tail_64_store
-
-Lopen_tail_64:
- // We just need a single block
- mov v0.16b, v24.16b
- mov v5.16b, v28.16b
- mov v10.16b, v29.16b
- mov v15.16b, v30.16b
- eor v23.16b, v23.16b, v23.16b
- ins v23.s[0], v25.s[0]
- add v15.4s, v15.4s, v23.4s
-
- mov x6, #10
- sub x6, x6, x4
-
-Lopen_tail_64_rounds:
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #4
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #12
- add v0.4s, v0.4s, v5.4s
- eor v15.16b, v15.16b, v0.16b
- rev32 v15.8h, v15.8h
-
- add v10.4s, v10.4s, v15.4s
- eor v5.16b, v5.16b, v10.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- add v0.4s, v0.4s, v20.4s
- eor v15.16b, v15.16b, v0.16b
- tbl v15.16b, {v15.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- eor v20.16b, v20.16b, v10.16b
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
- ext v5.16b, v5.16b, v5.16b, #12
- ext v10.16b, v10.16b, v10.16b, #8
- ext v15.16b, v15.16b, v15.16b, #4
- subs x6, x6, #1
- b.gt Lopen_tail_64_rounds
- cbz x4, Lopen_tail_64_rounds_done
- subs x4, x4, #1
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- b Lopen_tail_64_rounds
-
-Lopen_tail_64_rounds_done:
- add v0.4s, v0.4s, v24.4s
- add v5.4s, v5.4s, v28.4s
- add v10.4s, v10.4s, v29.4s
- add v15.4s, v15.4s, v30.4s
- add v15.4s, v15.4s, v23.4s
-
-Lopen_tail_64_store:
- cmp x2, #16
- b.lt Lopen_tail_16
-
- ld1 {v20.16b}, [x1], #16
- eor v20.16b, v20.16b, v0.16b
- st1 {v20.16b}, [x0], #16
- mov v0.16b, v5.16b
- mov v5.16b, v10.16b
- mov v10.16b, v15.16b
- sub x2, x2, #16
- b Lopen_tail_64_store
-
-Lopen_tail_16:
- // Here we handle the last [0,16) bytes that require a padded block
- cbz x2, Lopen_finalize
-
- eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
- eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
- not v22.16b, v20.16b
-
- add x7, x1, x2
- mov x6, x2
-
-Lopen_tail_16_compose:
- ext v20.16b, v20.16b, v20.16b, #15
- ldrb w11, [x7, #-1]!
- mov v20.b[0], w11
- ext v21.16b, v22.16b, v21.16b, #15
- subs x2, x2, #1
- b.gt Lopen_tail_16_compose
-
- and v20.16b, v20.16b, v21.16b
- // Hash in the final padded block
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- eor v20.16b, v20.16b, v0.16b
-
-Lopen_tail_16_store:
- umov w11, v20.b[0]
- strb w11, [x0], #1
- ext v20.16b, v20.16b, v20.16b, #1
- subs x6, x6, #1
- b.gt Lopen_tail_16_store
-
-Lopen_finalize:
- mov x11, v31.d[0]
- mov x12, v31.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- // Final reduction step
- sub x12, xzr, x15
- orr x13, xzr, #3
- subs x11, x8, #-5
- sbcs x12, x9, x12
- sbcs x13, x10, x13
- csel x8, x11, x8, cs
- csel x9, x12, x9, cs
- csel x10, x13, x10, cs
- mov x11, v27.d[0]
- mov x12, v27.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
-
- stp x8, x9, [x5]
-
- ldp d8, d9, [sp, #16]
- ldp d10, d11, [sp, #32]
- ldp d12, d13, [sp, #48]
- ldp d14, d15, [sp, #64]
-.cfi_restore b15
-.cfi_restore b14
-.cfi_restore b13
-.cfi_restore b12
-.cfi_restore b11
-.cfi_restore b10
-.cfi_restore b9
-.cfi_restore b8
- ldp x29, x30, [sp], 80
-.cfi_restore w29
-.cfi_restore w30
-.cfi_def_cfa_offset 0
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-Lopen_128:
- // On some architectures preparing 5 blocks for small buffers is wasteful
- eor v25.16b, v25.16b, v25.16b
- mov x11, #1
- mov v25.s[0], w11
- mov v0.16b, v24.16b
- mov v1.16b, v24.16b
- mov v2.16b, v24.16b
- mov v5.16b, v28.16b
- mov v6.16b, v28.16b
- mov v7.16b, v28.16b
- mov v10.16b, v29.16b
- mov v11.16b, v29.16b
- mov v12.16b, v29.16b
- mov v17.16b, v30.16b
- add v15.4s, v17.4s, v25.4s
- add v16.4s, v15.4s, v25.4s
-
- mov x6, #10
-
-Lopen_128_rounds:
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #4
- ext v7.16b, v7.16b, v7.16b, #4
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #12
- ext v16.16b, v16.16b, v16.16b, #12
- ext v17.16b, v17.16b, v17.16b, #12
- add v0.4s, v0.4s, v5.4s
- add v1.4s, v1.4s, v6.4s
- add v2.4s, v2.4s, v7.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- rev32 v15.8h, v15.8h
- rev32 v16.8h, v16.8h
- rev32 v17.8h, v17.8h
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v5.16b, v5.16b, v10.16b
- eor v6.16b, v6.16b, v11.16b
- eor v7.16b, v7.16b, v12.16b
- ushr v20.4s, v5.4s, #20
- sli v20.4s, v5.4s, #12
- ushr v5.4s, v6.4s, #20
- sli v5.4s, v6.4s, #12
- ushr v6.4s, v7.4s, #20
- sli v6.4s, v7.4s, #12
-
- add v0.4s, v0.4s, v20.4s
- add v1.4s, v1.4s, v5.4s
- add v2.4s, v2.4s, v6.4s
- eor v15.16b, v15.16b, v0.16b
- eor v16.16b, v16.16b, v1.16b
- eor v17.16b, v17.16b, v2.16b
- tbl v15.16b, {v15.16b}, v26.16b
- tbl v16.16b, {v16.16b}, v26.16b
- tbl v17.16b, {v17.16b}, v26.16b
-
- add v10.4s, v10.4s, v15.4s
- add v11.4s, v11.4s, v16.4s
- add v12.4s, v12.4s, v17.4s
- eor v20.16b, v20.16b, v10.16b
- eor v5.16b, v5.16b, v11.16b
- eor v6.16b, v6.16b, v12.16b
- ushr v7.4s, v6.4s, #25
- sli v7.4s, v6.4s, #7
- ushr v6.4s, v5.4s, #25
- sli v6.4s, v5.4s, #7
- ushr v5.4s, v20.4s, #25
- sli v5.4s, v20.4s, #7
-
- ext v5.16b, v5.16b, v5.16b, #12
- ext v6.16b, v6.16b, v6.16b, #12
- ext v7.16b, v7.16b, v7.16b, #12
-
- ext v10.16b, v10.16b, v10.16b, #8
- ext v11.16b, v11.16b, v11.16b, #8
- ext v12.16b, v12.16b, v12.16b, #8
-
- ext v15.16b, v15.16b, v15.16b, #4
- ext v16.16b, v16.16b, v16.16b, #4
- ext v17.16b, v17.16b, v17.16b, #4
- subs x6, x6, #1
- b.hi Lopen_128_rounds
-
- add v0.4s, v0.4s, v24.4s
- add v1.4s, v1.4s, v24.4s
- add v2.4s, v2.4s, v24.4s
-
- add v5.4s, v5.4s, v28.4s
- add v6.4s, v6.4s, v28.4s
- add v7.4s, v7.4s, v28.4s
-
- add v10.4s, v10.4s, v29.4s
- add v11.4s, v11.4s, v29.4s
-
- add v30.4s, v30.4s, v25.4s
- add v15.4s, v15.4s, v30.4s
- add v30.4s, v30.4s, v25.4s
- add v16.4s, v16.4s, v30.4s
-
- and v2.16b, v2.16b, v27.16b
- mov x16, v2.d[0] // Move the R key to GPRs
- mov x17, v2.d[1]
- mov v27.16b, v7.16b // Store the S key
-
- bl Lpoly_hash_ad_internal
-
-Lopen_128_store:
- cmp x2, #64
- b.lt Lopen_128_store_64
-
- ld1 {v20.16b - v23.16b}, [x1], #64
-
- mov x11, v20.d[0]
- mov x12, v20.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v21.d[0]
- mov x12, v21.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v22.d[0]
- mov x12, v22.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- mov x11, v23.d[0]
- mov x12, v23.d[1]
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v10.16b
- eor v23.16b, v23.16b, v15.16b
-
- st1 {v20.16b - v23.16b}, [x0], #64
-
- sub x2, x2, #64
-
- mov v0.16b, v1.16b
- mov v5.16b, v6.16b
- mov v10.16b, v11.16b
- mov v15.16b, v16.16b
-
-Lopen_128_store_64:
-
- lsr x4, x2, #4
- mov x3, x1
-
-Lopen_128_hash_64:
- cbz x4, Lopen_tail_64_store
- ldp x11, x12, [x3], 16
- adds x8, x8, x11
- adcs x9, x9, x12
- adc x10, x10, x15
- mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
- umulh x12, x8, x16
- mul x13, x9, x16
- umulh x14, x9, x16
- adds x12, x12, x13
- mul x13, x10, x16
- adc x13, x13, x14
- mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
- umulh x8, x8, x17
- adds x12, x12, x14
- mul x14, x9, x17
- umulh x9, x9, x17
- adcs x14, x14, x8
- mul x10, x10, x17
- adc x10, x10, x9
- adds x13, x13, x14
- adc x14, x10, xzr
- and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
- and x8, x13, #-4
- extr x13, x14, x13, #2
- adds x8, x8, x11
- lsr x11, x14, #2
- adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
- adds x8, x8, x13
- adcs x9, x9, x12
- adc x10, x10, xzr // At this point acc2 has the value of 4 at most
- sub x4, x4, #1
- b Lopen_128_hash_64
-.cfi_endproc
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
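For reference, the Poly1305 arithmetic in the deleted ChaCha20-Poly1305 routine above (the repeated [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] multiply, the mod 2^130 - 5 folding, and the Lopen_finalize tag step) reduces to a short plain-integer model. The sketch below is not taken from the generated file; it is a minimal Python reference under that reading of the comments, with hypothetical function names, and it reduces fully on every block rather than lazily as the assembly does.

```python
# Hypothetical reference model (not from the deleted .S file) of the Poly1305
# steps the assembly comments describe: accumulate a block, multiply by the
# clamped key r, reduce mod 2^130 - 5, then finalize by adding s.
P = (1 << 130) - 5  # the Poly1305 prime

def poly1305_block(acc: int, r: int, block: bytes) -> int:
    # Each block is read little-endian and padded with a 1 bit at 2^(8*len);
    # the assembly keeps acc as the limbs acc0/acc1/acc2 in x8/x9/x10.
    m = int.from_bytes(block, "little") + (1 << (8 * len(block)))
    return ((acc + m) * r) % P

def poly1305_finalize(acc: int, s: int) -> bytes:
    # Mirrors Lopen_finalize: final reduction, add the s half of the key
    # (the value the code above stashes in v27), keep the low 128 bits.
    return ((acc % P + s) & ((1 << 128) - 1)).to_bytes(16, "little")
```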
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S
deleted file mode 100644
index 144c4af..0000000
--- a/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S
+++ /dev/null
@@ -1,791 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.section __TEXT,__const
-.align 5
-Lrcon:
-.long 0x01,0x01,0x01,0x01
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
-.long 0x1b,0x1b,0x1b,0x1b
-
-.text
-
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-
-.align 5
-_aes_hw_set_encrypt_key:
-Lenc_key:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- AARCH64_VALID_CALL_TARGET
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- mov x3,#-1
- cmp x0,#0
- b.eq Lenc_key_abort
- cmp x2,#0
- b.eq Lenc_key_abort
- mov x3,#-2
- cmp w1,#128
- b.lt Lenc_key_abort
- cmp w1,#256
- b.gt Lenc_key_abort
- tst w1,#0x3f
- b.ne Lenc_key_abort
-
- adrp x3,Lrcon@PAGE
- add x3,x3,Lrcon@PAGEOFF
- cmp w1,#192
-
- eor v0.16b,v0.16b,v0.16b
- ld1 {v3.16b},[x0],#16
- mov w1,#8 // reuse w1
- ld1 {v1.4s,v2.4s},[x3],#32
-
- b.lt Loop128
- b.eq L192
- b L256
-
-.align 4
-Loop128:
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
- b.ne Loop128
-
- ld1 {v1.4s},[x3]
-
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
-
- tbl v6.16b,{v3.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v3.4s},[x2],#16
- aese v6.16b,v0.16b
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- eor v3.16b,v3.16b,v6.16b
- st1 {v3.4s},[x2]
- add x2,x2,#0x50
-
- mov w12,#10
- b Ldone
-
-.align 4
-L192:
- ld1 {v4.8b},[x0],#8
- movi v6.16b,#8 // borrow v6.16b
- st1 {v3.4s},[x2],#16
- sub v2.16b,v2.16b,v6.16b // adjust the mask
-
-Loop192:
- tbl v6.16b,{v4.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v4.8b},[x2],#8
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
-
- dup v5.4s,v3.s[3]
- eor v5.16b,v5.16b,v4.16b
- eor v6.16b,v6.16b,v1.16b
- ext v4.16b,v0.16b,v4.16b,#12
- shl v1.16b,v1.16b,#1
- eor v4.16b,v4.16b,v5.16b
- eor v3.16b,v3.16b,v6.16b
- eor v4.16b,v4.16b,v6.16b
- st1 {v3.4s},[x2],#16
- b.ne Loop192
-
- mov w12,#12
- add x2,x2,#0x20
- b Ldone
-
-.align 4
-L256:
- ld1 {v4.16b},[x0]
- mov w1,#7
- mov w12,#14
- st1 {v3.4s},[x2],#16
-
-Loop256:
- tbl v6.16b,{v4.16b},v2.16b
- ext v5.16b,v0.16b,v3.16b,#12
- st1 {v4.4s},[x2],#16
- aese v6.16b,v0.16b
- subs w1,w1,#1
-
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v3.16b,v3.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
- eor v3.16b,v3.16b,v5.16b
- shl v1.16b,v1.16b,#1
- eor v3.16b,v3.16b,v6.16b
- st1 {v3.4s},[x2],#16
- b.eq Ldone
-
- dup v6.4s,v3.s[3] // just splat
- ext v5.16b,v0.16b,v4.16b,#12
- aese v6.16b,v0.16b
-
- eor v4.16b,v4.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v4.16b,v4.16b,v5.16b
- ext v5.16b,v0.16b,v5.16b,#12
- eor v4.16b,v4.16b,v5.16b
-
- eor v4.16b,v4.16b,v6.16b
- b Loop256
-
-Ldone:
- str w12,[x2]
- mov x3,#0
-
-Lenc_key_abort:
- mov x0,x3 // return value
- ldr x29,[sp],#16
- ret
-
-
-.globl _aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
-
-.align 5
-_aes_hw_set_decrypt_key:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- bl Lenc_key
-
- cmp x0,#0
- b.ne Ldec_key_abort
-
- sub x2,x2,#240 // restore original x2
- mov x4,#-16
- add x0,x2,x12,lsl#4 // end of key schedule
-
- ld1 {v0.4s},[x2]
- ld1 {v1.4s},[x0]
- st1 {v0.4s},[x0],x4
- st1 {v1.4s},[x2],#16
-
-Loop_imc:
- ld1 {v0.4s},[x2]
- ld1 {v1.4s},[x0]
- aesimc v0.16b,v0.16b
- aesimc v1.16b,v1.16b
- st1 {v0.4s},[x0],x4
- st1 {v1.4s},[x2],#16
- cmp x0,x2
- b.hi Loop_imc
-
- ld1 {v0.4s},[x2]
- aesimc v0.16b,v0.16b
- st1 {v0.4s},[x0]
-
- eor x0,x0,x0 // return value
-Ldec_key_abort:
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _aes_hw_encrypt
-.private_extern _aes_hw_encrypt
-
-.align 5
-_aes_hw_encrypt:
- AARCH64_VALID_CALL_TARGET
- ldr w3,[x2,#240]
- ld1 {v0.4s},[x2],#16
- ld1 {v2.16b},[x0]
- sub w3,w3,#2
- ld1 {v1.4s},[x2],#16
-
-Loop_enc:
- aese v2.16b,v0.16b
- aesmc v2.16b,v2.16b
- ld1 {v0.4s},[x2],#16
- subs w3,w3,#2
- aese v2.16b,v1.16b
- aesmc v2.16b,v2.16b
- ld1 {v1.4s},[x2],#16
- b.gt Loop_enc
-
- aese v2.16b,v0.16b
- aesmc v2.16b,v2.16b
- ld1 {v0.4s},[x2]
- aese v2.16b,v1.16b
- eor v2.16b,v2.16b,v0.16b
-
- st1 {v2.16b},[x1]
- ret
-
-.globl _aes_hw_decrypt
-.private_extern _aes_hw_decrypt
-
-.align 5
-_aes_hw_decrypt:
- AARCH64_VALID_CALL_TARGET
- ldr w3,[x2,#240]
- ld1 {v0.4s},[x2],#16
- ld1 {v2.16b},[x0]
- sub w3,w3,#2
- ld1 {v1.4s},[x2],#16
-
-Loop_dec:
- aesd v2.16b,v0.16b
- aesimc v2.16b,v2.16b
- ld1 {v0.4s},[x2],#16
- subs w3,w3,#2
- aesd v2.16b,v1.16b
- aesimc v2.16b,v2.16b
- ld1 {v1.4s},[x2],#16
- b.gt Loop_dec
-
- aesd v2.16b,v0.16b
- aesimc v2.16b,v2.16b
- ld1 {v0.4s},[x2]
- aesd v2.16b,v1.16b
- eor v2.16b,v2.16b,v0.16b
-
- st1 {v2.16b},[x1]
- ret
-
-.globl _aes_hw_cbc_encrypt
-.private_extern _aes_hw_cbc_encrypt
-
-.align 5
-_aes_hw_cbc_encrypt:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- AARCH64_VALID_CALL_TARGET
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- subs x2,x2,#16
- mov x8,#16
- b.lo Lcbc_abort
- csel x8,xzr,x8,eq
-
- cmp w5,#0 // en- or decrypting?
- ldr w5,[x3,#240]
- and x2,x2,#-16
- ld1 {v6.16b},[x4]
- ld1 {v0.16b},[x0],x8
-
- ld1 {v16.4s,v17.4s},[x3] // load key schedule...
- sub w5,w5,#6
- add x7,x3,x5,lsl#4 // pointer to last 7 round keys
- sub w5,w5,#2
- ld1 {v18.4s,v19.4s},[x7],#32
- ld1 {v20.4s,v21.4s},[x7],#32
- ld1 {v22.4s,v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
-
- add x7,x3,#32
- mov w6,w5
- b.eq Lcbc_dec
-
- cmp w5,#2
- eor v0.16b,v0.16b,v6.16b
- eor v5.16b,v16.16b,v7.16b
- b.eq Lcbc_enc128
-
- ld1 {v2.4s,v3.4s},[x7]
- add x7,x3,#16
- add x6,x3,#16*4
- add x12,x3,#16*5
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- add x14,x3,#16*6
- add x3,x3,#16*7
- b Lenter_cbc_enc
-
-.align 4
-Loop_cbc_enc:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-Lenter_cbc_enc:
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x6]
- cmp w5,#4
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x12]
- b.eq Lcbc_enc192
-
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.4s},[x14]
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x3]
- nop
-
-Lcbc_enc192:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v18.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs Loop_cbc_enc
-
- st1 {v6.16b},[x1],#16
- b Lcbc_done
-
-.align 5
-Lcbc_enc128:
- ld1 {v2.4s,v3.4s},[x7]
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- b Lenter_cbc_enc128
-Loop_cbc_enc128:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
-Lenter_cbc_enc128:
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- subs x2,x2,#16
- aese v0.16b,v2.16b
- aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
- aese v0.16b,v3.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v18.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v19.16b
- aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
- aese v0.16b,v23.16b
- eor v6.16b,v0.16b,v7.16b
- b.hs Loop_cbc_enc128
-
- st1 {v6.16b},[x1],#16
- b Lcbc_done
-.align 5
-Lcbc_dec:
- ld1 {v18.16b},[x0],#16
- subs x2,x2,#32 // bias
- add w6,w5,#2
- orr v3.16b,v0.16b,v0.16b
- orr v1.16b,v0.16b,v0.16b
- orr v19.16b,v18.16b,v18.16b
- b.lo Lcbc_dec_tail
-
- orr v1.16b,v18.16b,v18.16b
- ld1 {v18.16b},[x0],#16
- orr v2.16b,v0.16b,v0.16b
- orr v3.16b,v1.16b,v1.16b
- orr v19.16b,v18.16b,v18.16b
-
-Loop3x_cbc_dec:
- aesd v0.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aesd v0.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt Loop3x_cbc_dec
-
- aesd v0.16b,v16.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- eor v4.16b,v6.16b,v7.16b
- subs x2,x2,#0x30
- eor v5.16b,v2.16b,v7.16b
- csel x6,x2,x6,lo // x6, w6, is zero at this point
- aesd v0.16b,v17.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
- add x0,x0,x6 // x0 is adjusted in such a way that
- // at exit from the loop v1.16b-v18.16b
- // are loaded with last "words"
- orr v6.16b,v19.16b,v19.16b
- mov x7,x3
- aesd v0.16b,v20.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v20.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
- ld1 {v2.16b},[x0],#16
- aesd v0.16b,v21.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v21.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
- aesd v0.16b,v22.16b
- aesimc v0.16b,v0.16b
- aesd v1.16b,v22.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- ld1 {v19.16b},[x0],#16
- aesd v0.16b,v23.16b
- aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- add w6,w5,#2
- eor v4.16b,v4.16b,v0.16b
- eor v5.16b,v5.16b,v1.16b
- eor v18.16b,v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v4.16b},[x1],#16
- orr v0.16b,v2.16b,v2.16b
- st1 {v5.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
- st1 {v18.16b},[x1],#16
- orr v18.16b,v19.16b,v19.16b
- b.hs Loop3x_cbc_dec
-
- cmn x2,#0x30
- b.eq Lcbc_done
- nop
-
-Lcbc_dec_tail:
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt Lcbc_dec_tail
-
- aesd v1.16b,v16.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v16.16b
- aesimc v18.16b,v18.16b
- aesd v1.16b,v17.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v17.16b
- aesimc v18.16b,v18.16b
- aesd v1.16b,v20.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v20.16b
- aesimc v18.16b,v18.16b
- cmn x2,#0x20
- aesd v1.16b,v21.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v21.16b
- aesimc v18.16b,v18.16b
- eor v5.16b,v6.16b,v7.16b
- aesd v1.16b,v22.16b
- aesimc v1.16b,v1.16b
- aesd v18.16b,v22.16b
- aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
- aesd v1.16b,v23.16b
- aesd v18.16b,v23.16b
- b.eq Lcbc_dec_one
- eor v5.16b,v5.16b,v1.16b
- eor v17.16b,v17.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
- st1 {v5.16b},[x1],#16
- st1 {v17.16b},[x1],#16
- b Lcbc_done
-
-Lcbc_dec_one:
- eor v5.16b,v5.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
- st1 {v5.16b},[x1],#16
-
-Lcbc_done:
- st1 {v6.16b},[x4]
-Lcbc_abort:
- ldr x29,[sp],#16
- ret
-
-.globl _aes_hw_ctr32_encrypt_blocks
-.private_extern _aes_hw_ctr32_encrypt_blocks
-
-.align 5
-_aes_hw_ctr32_encrypt_blocks:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- AARCH64_VALID_CALL_TARGET
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
-
- ldr w8, [x4, #12]
- ld1 {v0.4s},[x4]
-
- ld1 {v16.4s,v17.4s},[x3] // load key schedule...
- sub w5,w5,#4
- mov x12,#16
- cmp x2,#2
- add x7,x3,x5,lsl#4 // pointer to last 5 round keys
- sub w5,w5,#2
- ld1 {v20.4s,v21.4s},[x7],#32
- ld1 {v22.4s,v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
- add x7,x3,#32
- mov w6,w5
- csel x12,xzr,x12,lo
-
- // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
- // affected by silicon errata #1742098 [0] and #1655431 [1],
- // respectively, where the second instruction of an aese/aesmc
- // instruction pair may execute twice if an interrupt is taken right
- // after the first instruction consumes an input register of which a
- // single 32-bit lane has been updated the last time it was modified.
- //
- // This function uses a counter in one 32-bit lane. The vmov lines
- // could write to v1.16b and v18.16b directly, but that trips these bugs.
- // We write to v6.16b and copy to the final register as a workaround.
- //
- // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
- // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
-#ifndef __AARCH64EB__
- rev w8, w8
-#endif
- add w10, w8, #1
- orr v6.16b,v0.16b,v0.16b
- rev w10, w10
- mov v6.s[3],w10
- add w8, w8, #2
- orr v1.16b,v6.16b,v6.16b
- b.ls Lctr32_tail
- rev w12, w8
- mov v6.s[3],w12
- sub x2,x2,#3 // bias
- orr v18.16b,v6.16b,v6.16b
- b Loop3x_ctr32
-
-.align 4
-Loop3x_ctr32:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v17.16b
- aesmc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt Loop3x_ctr32
-
- aese v0.16b,v16.16b
- aesmc v4.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v5.16b,v1.16b
- ld1 {v2.16b},[x0],#16
- add w9,w8,#1
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
- rev w9,w9
- aese v4.16b,v17.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v17.16b
- aesmc v5.16b,v5.16b
- ld1 {v19.16b},[x0],#16
- mov x7,x3
- aese v18.16b,v17.16b
- aesmc v17.16b,v18.16b
- aese v4.16b,v20.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v20.16b
- aesmc v5.16b,v5.16b
- eor v2.16b,v2.16b,v7.16b
- add w10,w8,#2
- aese v17.16b,v20.16b
- aesmc v17.16b,v17.16b
- eor v3.16b,v3.16b,v7.16b
- add w8,w8,#3
- aese v4.16b,v21.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v21.16b
- aesmc v5.16b,v5.16b
- // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
- // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
- // 32-bit mode. See the comment above.
- eor v19.16b,v19.16b,v7.16b
- mov v6.s[3], w9
- aese v17.16b,v21.16b
- aesmc v17.16b,v17.16b
- orr v0.16b,v6.16b,v6.16b
- rev w10,w10
- aese v4.16b,v22.16b
- aesmc v4.16b,v4.16b
- mov v6.s[3], w10
- rev w12,w8
- aese v5.16b,v22.16b
- aesmc v5.16b,v5.16b
- orr v1.16b,v6.16b,v6.16b
- mov v6.s[3], w12
- aese v17.16b,v22.16b
- aesmc v17.16b,v17.16b
- orr v18.16b,v6.16b,v6.16b
- subs x2,x2,#3
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
- aese v17.16b,v23.16b
-
- eor v2.16b,v2.16b,v4.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- st1 {v2.16b},[x1],#16
- eor v3.16b,v3.16b,v5.16b
- mov w6,w5
- st1 {v3.16b},[x1],#16
- eor v19.16b,v19.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v19.16b},[x1],#16
- b.hs Loop3x_ctr32
-
- adds x2,x2,#3
- b.eq Lctr32_done
- cmp x2,#1
- mov x12,#16
- csel x12,xzr,x12,eq
-
-Lctr32_tail:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v17.4s},[x7],#16
- b.gt Lctr32_tail
-
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v2.16b},[x0],x12
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v20.16b
- aesmc v1.16b,v1.16b
- ld1 {v3.16b},[x0]
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v21.16b
- aesmc v1.16b,v1.16b
- eor v2.16b,v2.16b,v7.16b
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v22.16b
- aesmc v1.16b,v1.16b
- eor v3.16b,v3.16b,v7.16b
- aese v0.16b,v23.16b
- aese v1.16b,v23.16b
-
- cmp x2,#1
- eor v2.16b,v2.16b,v0.16b
- eor v3.16b,v3.16b,v1.16b
- st1 {v2.16b},[x1],#16
- b.eq Lctr32_done
- st1 {v3.16b},[x1]
-
-Lctr32_done:
- ldr x29,[sp],#16
- ret
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
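For reference, the deleted _aes_hw_ctr32_encrypt_blocks routine above, including its Cortex-A57/A72 errata workaround, hinges on one detail: only the last 32-bit word of the counter block is incremented, big-endian, wrapping modulo 2^32 (the rev/add/rev sequence on w8). The sketch below is not taken from the generated file; it is a minimal Python reference for just that counter step, with a hypothetical helper name.

```python
def next_ctr32_block(block16: bytes) -> bytes:
    # Hypothetical reference for the counter handling in
    # _aes_hw_ctr32_encrypt_blocks: the first 12 bytes are left untouched and
    # only the final big-endian 32-bit word increments, wrapping mod 2**32.
    prefix, ctr = block16[:12], int.from_bytes(block16[12:], "big")
    return prefix + ((ctr + 1) & 0xFFFFFFFF).to_bytes(4, "big")
```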
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S
deleted file mode 100644
index 13be797..0000000
--- a/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S
+++ /dev/null
@@ -1,1555 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-#if __ARM_MAX_ARCH__ >= 8
-
-
-.text
-.globl _aes_gcm_enc_kernel
-.private_extern _aes_gcm_enc_kernel
-
-.align 4
-_aes_gcm_enc_kernel:
- AARCH64_SIGN_LINK_REGISTER
- stp x29, x30, [sp, #-128]!
- mov x29, sp
- stp x19, x20, [sp, #16]
- mov x16, x4
- mov x8, x5
- stp x21, x22, [sp, #32]
- stp x23, x24, [sp, #48]
- stp d8, d9, [sp, #64]
- stp d10, d11, [sp, #80]
- stp d12, d13, [sp, #96]
- stp d14, d15, [sp, #112]
- ldr w17, [x8, #240]
- add x19, x8, x17, lsl #4 // borrow input_l1 for last key
- ldp x13, x14, [x19] // load round N keys
- ldr q31, [x19, #-16] // load round N-1 keys
- add x4, x0, x1, lsr #3 // end_input_ptr
- lsr x5, x1, #3 // byte_len
- mov x15, x5
- ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
- ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
- sub x5, x5, #1 // byte_len - 1
- ldr q18, [x8, #0] // load rk0
- and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
- ldr q25, [x8, #112] // load rk7
- add x5, x5, x0
- lsr x12, x11, #32
- fmov d2, x10 // CTR block 2
- orr w11, w11, w11
- rev w12, w12 // rev_ctr32
- fmov d1, x10 // CTR block 1
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 0
- add w12, w12, #1 // increment rev_ctr32
- rev w9, w12 // CTR block 1
- fmov d3, x10 // CTR block 3
- orr x9, x11, x9, lsl #32 // CTR block 1
- add w12, w12, #1 // CTR block 1
- ldr q19, [x8, #16] // load rk1
- fmov v1.d[1], x9 // CTR block 1
- rev w9, w12 // CTR block 2
- add w12, w12, #1 // CTR block 2
- orr x9, x11, x9, lsl #32 // CTR block 2
- ldr q20, [x8, #32] // load rk2
- fmov v2.d[1], x9 // CTR block 2
- rev w9, w12 // CTR block 3
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 1
- orr x9, x11, x9, lsl #32 // CTR block 3
- fmov v3.d[1], x9 // CTR block 3
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 0
- ldr q21, [x8, #48] // load rk3
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 2
- ldr q24, [x8, #96] // load rk6
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 0
- ldr q23, [x8, #80] // load rk5
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 1
- ldr q14, [x6, #48] // load h3l | h3h
- ext v14.16b, v14.16b, v14.16b, #8
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 0
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 1
- ldr q22, [x8, #64] // load rk4
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 2
- ldr q13, [x6, #32] // load h2l | h2h
- ext v13.16b, v13.16b, v13.16b, #8
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 1
- ldr q30, [x8, #192] // load rk12
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 2
- ldr q15, [x6, #80] // load h4l | h4h
- ext v15.16b, v15.16b, v15.16b, #8
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 3
- ldr q29, [x8, #176] // load rk11
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 2
- ldr q26, [x8, #128] // load rk8
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 3
- add w12, w12, #1 // CTR block 3
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 3
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 3
- ld1 { v11.16b}, [x3]
- ext v11.16b, v11.16b, v11.16b, #8
- rev64 v11.16b, v11.16b
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 4
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 4
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 4
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 4
- cmp x17, #12 // setup flags for AES-128/192/256 check
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 5
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 5
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 5
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 5
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 6
- trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 6
- ldr q27, [x8, #144] // load rk9
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 6
- ldr q12, [x6] // load h1l | h1h
- ext v12.16b, v12.16b, v12.16b, #8
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 6
- ldr q28, [x8, #160] // load rk10
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 7
- trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 7
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 7
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 7
- trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 8
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 8
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 8
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 8
- b.lt Lenc_finish_first_blocks // branch if AES-128
-
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 9
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 9
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 10
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 10
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 10
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 10
- b.eq Lenc_finish_first_blocks // branch if AES-192
-
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 11
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 11
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 11
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 11
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 12
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 12
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 12
-
-Lenc_finish_first_blocks:
- cmp x0, x5 // check if we have <= 4 blocks
- eor v17.16b, v17.16b, v9.16b // h4k | h3k
- aese v2.16b, v31.16b // AES block 2 - round N-1
- trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
- aese v1.16b, v31.16b // AES block 1 - round N-1
- aese v0.16b, v31.16b // AES block 0 - round N-1
- aese v3.16b, v31.16b // AES block 3 - round N-1
- eor v16.16b, v16.16b, v8.16b // h2k | h1k
- b.ge Lenc_tail // handle tail
-
- ldp x19, x20, [x0, #16] // AES block 1 - load plaintext
- rev w9, w12 // CTR block 4
- ldp x6, x7, [x0, #0] // AES block 0 - load plaintext
- ldp x23, x24, [x0, #48] // AES block 3 - load plaintext
- ldp x21, x22, [x0, #32] // AES block 2 - load plaintext
- add x0, x0, #64 // AES input_ptr update
- eor x19, x19, x13 // AES block 1 - round N low
- eor x20, x20, x14 // AES block 1 - round N high
- fmov d5, x19 // AES block 1 - mov low
- eor x6, x6, x13 // AES block 0 - round N low
- eor x7, x7, x14 // AES block 0 - round N high
- eor x24, x24, x14 // AES block 3 - round N high
- fmov d4, x6 // AES block 0 - mov low
- cmp x0, x5 // check if we have <= 8 blocks
- fmov v4.d[1], x7 // AES block 0 - mov high
- eor x23, x23, x13 // AES block 3 - round N low
- eor x21, x21, x13 // AES block 2 - round N low
- fmov v5.d[1], x20 // AES block 1 - mov high
- fmov d6, x21 // AES block 2 - mov low
- add w12, w12, #1 // CTR block 4
- orr x9, x11, x9, lsl #32 // CTR block 4
- fmov d7, x23 // AES block 3 - mov low
- eor x22, x22, x14 // AES block 2 - round N high
- fmov v6.d[1], x22 // AES block 2 - mov high
- eor v4.16b, v4.16b, v0.16b // AES block 0 - result
- fmov d0, x10 // CTR block 4
- fmov v0.d[1], x9 // CTR block 4
- rev w9, w12 // CTR block 5
- add w12, w12, #1 // CTR block 5
- eor v5.16b, v5.16b, v1.16b // AES block 1 - result
- fmov d1, x10 // CTR block 5
- orr x9, x11, x9, lsl #32 // CTR block 5
- fmov v1.d[1], x9 // CTR block 5
- rev w9, w12 // CTR block 6
- st1 { v4.16b}, [x2], #16 // AES block 0 - store result
- fmov v7.d[1], x24 // AES block 3 - mov high
- orr x9, x11, x9, lsl #32 // CTR block 6
- eor v6.16b, v6.16b, v2.16b // AES block 2 - result
- st1 { v5.16b}, [x2], #16 // AES block 1 - store result
- add w12, w12, #1 // CTR block 6
- fmov d2, x10 // CTR block 6
- fmov v2.d[1], x9 // CTR block 6
- st1 { v6.16b}, [x2], #16 // AES block 2 - store result
- rev w9, w12 // CTR block 7
- orr x9, x11, x9, lsl #32 // CTR block 7
- eor v7.16b, v7.16b, v3.16b // AES block 3 - result
- st1 { v7.16b}, [x2], #16 // AES block 3 - store result
- b.ge Lenc_prepretail // do prepretail
-
-Lenc_main_loop: // main loop start
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
- rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
- fmov d3, x10 // CTR block 4k+3
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
- ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
- fmov v3.d[1], x9 // CTR block 4k+3
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
- ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
- ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
- eor v4.16b, v4.16b, v11.16b // PRE 1
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
- eor x23, x23, x13 // AES block 4k+7 - round N low
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
- mov d10, v17.d[1] // GHASH block 4k - mid
- pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
- eor x22, x22, x14 // AES block 4k+6 - round N high
- mov d8, v4.d[1] // GHASH block 4k - mid
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
- rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
- pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
- eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
- rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
- pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
- pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
- rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
- pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
- mov d4, v5.d[1] // GHASH block 4k+1 - mid
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
- eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
- mov d8, v6.d[1] // GHASH block 4k+2 - mid
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
- eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
- eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
- pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
- ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
- pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
- pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
- pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
- ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
- mov d4, v7.d[1] // GHASH block 4k+3 - mid
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
- eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
- pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
- pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
- eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
- eor x19, x19, x13 // AES block 4k+5 - round N low
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
- eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
- eor x21, x21, x13 // AES block 4k+6 - round N low
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
- movi v8.8b, #0xc2
- pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
- eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
- cmp x17, #12 // setup flags for AES-128/192/256 check
- fmov d5, x19 // AES block 4k+5 - mov low
- ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext
- b.lt Lenc_main_loop_continue // branch if AES-128
-
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
- b.eq Lenc_main_loop_continue // branch if AES-192
-
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
-
-Lenc_main_loop_continue:
- shl d8, d8, #56 // mod_constant
- eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
- add w12, w12, #1 // CTR block 4k+3
- eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
- add x0, x0, #64 // AES input_ptr update
- pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
- rev w9, w12 // CTR block 4k+8
- ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
- eor x6, x6, x13 // AES block 4k+4 - round N low
- eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
- eor x7, x7, x14 // AES block 4k+4 - round N high
- fmov d4, x6 // AES block 4k+4 - mov low
- orr x9, x11, x9, lsl #32 // CTR block 4k+8
- eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid
- eor x20, x20, x14 // AES block 4k+5 - round N high
- eor x24, x24, x14 // AES block 4k+7 - round N high
- add w12, w12, #1 // CTR block 4k+8
- aese v0.16b, v31.16b // AES block 4k+4 - round N-1
- fmov v4.d[1], x7 // AES block 4k+4 - mov high
- eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
- fmov d7, x23 // AES block 4k+7 - mov low
- aese v1.16b, v31.16b // AES block 4k+5 - round N-1
- fmov v5.d[1], x20 // AES block 4k+5 - mov high
- fmov d6, x21 // AES block 4k+6 - mov low
- cmp x0, x5 // LOOP CONTROL
- fmov v6.d[1], x22 // AES block 4k+6 - mov high
- pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
- eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result
- fmov d0, x10 // CTR block 4k+8
- fmov v0.d[1], x9 // CTR block 4k+8
- rev w9, w12 // CTR block 4k+9
- add w12, w12, #1 // CTR block 4k+9
- eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result
- fmov d1, x10 // CTR block 4k+9
- orr x9, x11, x9, lsl #32 // CTR block 4k+9
- fmov v1.d[1], x9 // CTR block 4k+9
- aese v2.16b, v31.16b // AES block 4k+6 - round N-1
- rev w9, w12 // CTR block 4k+10
- st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result
- orr x9, x11, x9, lsl #32 // CTR block 4k+10
- eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
- fmov v7.d[1], x24 // AES block 4k+7 - mov high
- ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
- st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result
- add w12, w12, #1 // CTR block 4k+10
- aese v3.16b, v31.16b // AES block 4k+7 - round N-1
- eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result
- fmov d2, x10 // CTR block 4k+10
- st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result
- fmov v2.d[1], x9 // CTR block 4k+10
- rev w9, w12 // CTR block 4k+11
- eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
- orr x9, x11, x9, lsl #32 // CTR block 4k+11
- eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result
- st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result
- b.lt Lenc_main_loop
-
-Lenc_prepretail: // PREPRETAIL
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
- rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
- fmov d3, x10 // CTR block 4k+3
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
- rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
- fmov v3.d[1], x9 // CTR block 4k+3
- ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
- eor v4.16b, v4.16b, v11.16b // PRE 1
- rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
- mov d10, v17.d[1] // GHASH block 4k - mid
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
- pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
- mov d8, v4.d[1] // GHASH block 4k - mid
- pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
- eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
- pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
- pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
- pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
- mov d4, v5.d[1] // GHASH block 4k+1 - mid
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
- eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
- eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
- mov d8, v6.d[1] // GHASH block 4k+2 - mid
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
- rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
- pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
- eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
- add w12, w12, #1 // CTR block 4k+3
- pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
- pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
- eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
- ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
- mov d4, v7.d[1] // GHASH block 4k+3 - mid
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
- pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
- eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
- pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
- pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
- eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
- movi v8.8b, #0xc2
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
- eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
- shl d8, d8, #56 // mod_constant
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
- pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
- cmp x17, #12 // setup flags for AES-128/192/256 check
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
- eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
- eor v10.16b, v10.16b, v9.16b // karatsuba tidy up
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
- pmull v4.1q, v9.1d, v8.1d
- ext v9.16b, v9.16b, v9.16b, #8
- eor v10.16b, v10.16b, v11.16b
- b.lt Lenc_finish_prepretail // branch if AES-128
-
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
- b.eq Lenc_finish_prepretail // branch if AES-192
-
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
-
-Lenc_finish_prepretail:
- eor v10.16b, v10.16b, v4.16b
- eor v10.16b, v10.16b, v9.16b
- pmull v4.1q, v10.1d, v8.1d
- ext v10.16b, v10.16b, v10.16b, #8
- aese v1.16b, v31.16b // AES block 4k+5 - round N-1
- eor v11.16b, v11.16b, v4.16b
- aese v3.16b, v31.16b // AES block 4k+7 - round N-1
- aese v0.16b, v31.16b // AES block 4k+4 - round N-1
- aese v2.16b, v31.16b // AES block 4k+6 - round N-1
- eor v11.16b, v11.16b, v10.16b
-
-Lenc_tail: // TAIL
- ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
- sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
- ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext
- eor x6, x6, x13 // AES block 4k+4 - round N low
- eor x7, x7, x14 // AES block 4k+4 - round N high
- cmp x5, #48
- fmov d4, x6 // AES block 4k+4 - mov low
- fmov v4.d[1], x7 // AES block 4k+4 - mov high
- eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result
- b.gt Lenc_blocks_more_than_3
- cmp x5, #32
- mov v3.16b, v2.16b
- movi v11.8b, #0
- movi v9.8b, #0
- sub w12, w12, #1
- mov v2.16b, v1.16b
- movi v10.8b, #0
- b.gt Lenc_blocks_more_than_2
- mov v3.16b, v1.16b
- sub w12, w12, #1
- cmp x5, #16
- b.gt Lenc_blocks_more_than_1
- sub w12, w12, #1
- b Lenc_blocks_less_than_1
-Lenc_blocks_more_than_3: // blocks left > 3
- st1 { v5.16b}, [x2], #16 // AES final-3 block - store result
- ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high
- rev64 v4.16b, v5.16b // GHASH final-3 block
- eor x6, x6, x13 // AES final-2 block - round N low
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- eor x7, x7, x14 // AES final-2 block - round N high
- mov d22, v4.d[1] // GHASH final-3 block - mid
- fmov d5, x6 // AES final-2 block - mov low
- fmov v5.d[1], x7 // AES final-2 block - mov high
- eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
- movi v8.8b, #0 // suppress further partial tag feed in
- mov d10, v17.d[1] // GHASH final-3 block - mid
- pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
- pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
- pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
- eor v5.16b, v5.16b, v1.16b // AES final-2 block - result
-Lenc_blocks_more_than_2: // blocks left > 2
- st1 { v5.16b}, [x2], #16 // AES final-2 block - store result
- ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high
- rev64 v4.16b, v5.16b // GHASH final-2 block
- eor x6, x6, x13 // AES final-1 block - round N low
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- fmov d5, x6 // AES final-1 block - mov low
- eor x7, x7, x14 // AES final-1 block - round N high
- fmov v5.d[1], x7 // AES final-1 block - mov high
- movi v8.8b, #0 // suppress further partial tag feed in
- pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
- mov d22, v4.d[1] // GHASH final-2 block - mid
- pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
- eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
- eor v5.16b, v5.16b, v2.16b // AES final-1 block - result
- eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
- pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
- eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
- eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
-Lenc_blocks_more_than_1: // blocks left > 1
- st1 { v5.16b}, [x2], #16 // AES final-1 block - store result
- rev64 v4.16b, v5.16b // GHASH final-1 block
- ldp x6, x7, [x0], #16 // AES final block - load input low & high
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- movi v8.8b, #0 // suppress further partial tag feed in
- eor x6, x6, x13 // AES final block - round N low
- mov d22, v4.d[1] // GHASH final-1 block - mid
- pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
- eor x7, x7, x14 // AES final block - round N high
- eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
- eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
- ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
- fmov d5, x6 // AES final block - mov low
- fmov v5.d[1], x7 // AES final block - mov high
- pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
- pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
- eor v5.16b, v5.16b, v3.16b // AES final block - result
- eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
- eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
-Lenc_blocks_less_than_1: // blocks left <= 1
- and x1, x1, #127 // bit_length %= 128
- mvn x13, xzr // rkN_l = 0xffffffffffffffff
- sub x1, x1, #128 // bit_length -= 128
- neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
- ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored
- mvn x14, xzr // rkN_h = 0xffffffffffffffff
- and x1, x1, #127 // bit_length %= 128
- lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
- cmp x1, #64
- csel x6, x13, x14, lt
- csel x7, x14, xzr, lt
- fmov d0, x6 // ctr0b is mask for last block
- fmov v0.d[1], x7
- and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
- rev64 v4.16b, v5.16b // GHASH final block
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing
- pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
- mov d8, v4.d[1] // GHASH final block - mid
- rev w9, w12
- pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
- eor v9.16b, v9.16b, v20.16b // GHASH final block - high
- eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
- pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
- eor v11.16b, v11.16b, v21.16b // GHASH final block - low
- eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
- movi v8.8b, #0xc2
- eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
- shl d8, d8, #56 // mod_constant
- eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
- pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
- ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
- eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
- eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
- pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
- ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
- str w9, [x16, #12] // store the updated counter
- st1 { v5.16b}, [x2] // store all 16B
- eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
- eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
- ext v11.16b, v11.16b, v11.16b, #8
- rev64 v11.16b, v11.16b
- mov x0, x15
- st1 { v11.16b }, [x3]
- ldp x19, x20, [sp, #16]
- ldp x21, x22, [sp, #32]
- ldp x23, x24, [sp, #48]
- ldp d8, d9, [sp, #64]
- ldp d10, d11, [sp, #80]
- ldp d12, d13, [sp, #96]
- ldp d14, d15, [sp, #112]
- ldp x29, x30, [sp], #128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _aes_gcm_dec_kernel
-.private_extern _aes_gcm_dec_kernel
-
-.align 4
-_aes_gcm_dec_kernel:
- AARCH64_SIGN_LINK_REGISTER
- stp x29, x30, [sp, #-128]!
- mov x29, sp
- stp x19, x20, [sp, #16]
- mov x16, x4
- mov x8, x5
- stp x21, x22, [sp, #32]
- stp x23, x24, [sp, #48]
- stp d8, d9, [sp, #64]
- stp d10, d11, [sp, #80]
- stp d12, d13, [sp, #96]
- stp d14, d15, [sp, #112]
- ldr w17, [x8, #240]
- add x19, x8, x17, lsl #4 // borrow input_l1 for last key
- ldp x13, x14, [x19] // load round N keys
- ldr q31, [x19, #-16] // load round N-1 keys
- lsr x5, x1, #3 // byte_len
- mov x15, x5
- ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
- ldr q26, [x8, #128] // load rk8
- sub x5, x5, #1 // byte_len - 1
- ldr q25, [x8, #112] // load rk7
- and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
- add x4, x0, x1, lsr #3 // end_input_ptr
- ldr q24, [x8, #96] // load rk6
- lsr x12, x11, #32
- ldr q23, [x8, #80] // load rk5
- orr w11, w11, w11
- ldr q21, [x8, #48] // load rk3
- add x5, x5, x0
- rev w12, w12 // rev_ctr32
- add w12, w12, #1 // increment rev_ctr32
- fmov d3, x10 // CTR block 3
- rev w9, w12 // CTR block 1
- add w12, w12, #1 // CTR block 1
- fmov d1, x10 // CTR block 1
- orr x9, x11, x9, lsl #32 // CTR block 1
- ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
- fmov v1.d[1], x9 // CTR block 1
- rev w9, w12 // CTR block 2
- add w12, w12, #1 // CTR block 2
- fmov d2, x10 // CTR block 2
- orr x9, x11, x9, lsl #32 // CTR block 2
- fmov v2.d[1], x9 // CTR block 2
- rev w9, w12 // CTR block 3
- orr x9, x11, x9, lsl #32 // CTR block 3
- ldr q18, [x8, #0] // load rk0
- fmov v3.d[1], x9 // CTR block 3
- add w12, w12, #1 // CTR block 3
- ldr q22, [x8, #64] // load rk4
- ldr q19, [x8, #16] // load rk1
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 0
- ldr q14, [x6, #48] // load h3l | h3h
- ext v14.16b, v14.16b, v14.16b, #8
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 0
- ldr q15, [x6, #80] // load h4l | h4h
- ext v15.16b, v15.16b, v15.16b, #8
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 0
- ldr q13, [x6, #32] // load h2l | h2h
- ext v13.16b, v13.16b, v13.16b, #8
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 0
- ldr q20, [x8, #32] // load rk2
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 1
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 1
- ld1 { v11.16b}, [x3]
- ext v11.16b, v11.16b, v11.16b, #8
- rev64 v11.16b, v11.16b
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 1
- ldr q27, [x8, #144] // load rk9
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 1
- ldr q30, [x8, #192] // load rk12
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 2
- ldr q12, [x6] // load h1l | h1h
- ext v12.16b, v12.16b, v12.16b, #8
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 2
- ldr q28, [x8, #160] // load rk10
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 2
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 3
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 2
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 3
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 4
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 3
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 3
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 4
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 4
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 4
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 5
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 5
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 5
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 5
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 6
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 6
- cmp x17, #12 // setup flags for AES-128/192/256 check
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 6
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 6
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 7
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 7
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 7
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 8
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 7
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 8
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 8
- ldr q29, [x8, #176] // load rk11
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 8
- b.lt Ldec_finish_first_blocks // branch if AES-128
-
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 9
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 9
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 10
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 10
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 10
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 10
- b.eq Ldec_finish_first_blocks // branch if AES-192
-
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 11
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 11
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 11
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 11
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 1 - round 12
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 0 - round 12
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 2 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 3 - round 12
-
-Ldec_finish_first_blocks:
- cmp x0, x5 // check if we have <= 4 blocks
- trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
- trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
- trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
- trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
- eor v17.16b, v17.16b, v9.16b // h4k | h3k
- aese v1.16b, v31.16b // AES block 1 - round N-1
- aese v2.16b, v31.16b // AES block 2 - round N-1
- eor v16.16b, v16.16b, v8.16b // h2k | h1k
- aese v3.16b, v31.16b // AES block 3 - round N-1
- aese v0.16b, v31.16b // AES block 0 - round N-1
- b.ge Ldec_tail // handle tail
-
- ldr q4, [x0, #0] // AES block 0 - load ciphertext
- ldr q5, [x0, #16] // AES block 1 - load ciphertext
- rev w9, w12 // CTR block 4
- eor v0.16b, v4.16b, v0.16b // AES block 0 - result
- eor v1.16b, v5.16b, v1.16b // AES block 1 - result
- rev64 v5.16b, v5.16b // GHASH block 1
- ldr q7, [x0, #48] // AES block 3 - load ciphertext
- mov x7, v0.d[1] // AES block 0 - mov high
- mov x6, v0.d[0] // AES block 0 - mov low
- rev64 v4.16b, v4.16b // GHASH block 0
- add w12, w12, #1 // CTR block 4
- fmov d0, x10 // CTR block 4
- orr x9, x11, x9, lsl #32 // CTR block 4
- fmov v0.d[1], x9 // CTR block 4
- rev w9, w12 // CTR block 5
- add w12, w12, #1 // CTR block 5
- mov x19, v1.d[0] // AES block 1 - mov low
- orr x9, x11, x9, lsl #32 // CTR block 5
- mov x20, v1.d[1] // AES block 1 - mov high
- eor x7, x7, x14 // AES block 0 - round N high
- eor x6, x6, x13 // AES block 0 - round N low
- stp x6, x7, [x2], #16 // AES block 0 - store result
- fmov d1, x10 // CTR block 5
- ldr q6, [x0, #32] // AES block 2 - load ciphertext
- add x0, x0, #64 // AES input_ptr update
- fmov v1.d[1], x9 // CTR block 5
- rev w9, w12 // CTR block 6
- add w12, w12, #1 // CTR block 6
- eor x19, x19, x13 // AES block 1 - round N low
- orr x9, x11, x9, lsl #32 // CTR block 6
- eor x20, x20, x14 // AES block 1 - round N high
- stp x19, x20, [x2], #16 // AES block 1 - store result
- eor v2.16b, v6.16b, v2.16b // AES block 2 - result
- cmp x0, x5 // check if we have <= 8 blocks
- b.ge Ldec_prepretail // do prepretail
-
-Ldec_main_loop: // main loop start
- mov x21, v2.d[0] // AES block 4k+2 - mov low
- ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
- eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
- mov x22, v2.d[1] // AES block 4k+2 - mov high
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
- fmov d2, x10 // CTR block 4k+6
- fmov v2.d[1], x9 // CTR block 4k+6
- eor v4.16b, v4.16b, v11.16b // PRE 1
- rev w9, w12 // CTR block 4k+7
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
- mov x24, v3.d[1] // AES block 4k+3 - mov high
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
- mov x23, v3.d[0] // AES block 4k+3 - mov low
- pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
- mov d8, v4.d[1] // GHASH block 4k - mid
- fmov d3, x10 // CTR block 4k+7
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
- orr x9, x11, x9, lsl #32 // CTR block 4k+7
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
- fmov v3.d[1], x9 // CTR block 4k+7
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
- eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
- eor x22, x22, x14 // AES block 4k+2 - round N high
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
- mov d10, v17.d[1] // GHASH block 4k - mid
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
- rev64 v6.16b, v6.16b // GHASH block 4k+2
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
- eor x21, x21, x13 // AES block 4k+2 - round N low
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
- stp x21, x22, [x2], #16 // AES block 4k+2 - store result
- pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
- pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
- rev64 v7.16b, v7.16b // GHASH block 4k+3
- pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
- eor x23, x23, x13 // AES block 4k+3 - round N low
- pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
- eor x24, x24, x14 // AES block 4k+3 - round N high
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
- mov d4, v5.d[1] // GHASH block 4k+1 - mid
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
- eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
- add w12, w12, #1 // CTR block 4k+7
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
- mov d8, v6.d[1] // GHASH block 4k+2 - mid
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
- eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
- pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
- eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
- eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
- pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
- rev w9, w12 // CTR block 4k+8
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
- ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
- add w12, w12, #1 // CTR block 4k+8
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
- pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
- mov d6, v7.d[1] // GHASH block 4k+3 - mid
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
- pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
- pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
- orr x9, x11, x9, lsl #32 // CTR block 4k+8
- eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
- pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
- cmp x17, #12 // setup flags for AES-128/192/256 check
- eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
- eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
- pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
- movi v8.8b, #0xc2
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
- eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
- shl d8, d8, #56 // mod_constant
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
- eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
- b.lt Ldec_main_loop_continue // branch if AES-128
-
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
- b.eq Ldec_main_loop_continue // branch if AES-192
-
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
-
-Ldec_main_loop_continue:
- pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
- eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
- ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext
- aese v0.16b, v31.16b // AES block 4k+4 - round N-1
- ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
- eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
- ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext
- eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result
- stp x23, x24, [x2], #16 // AES block 4k+3 - store result
- eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
- ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext
- ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext
- mov x7, v0.d[1] // AES block 4k+4 - mov high
- eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
- aese v1.16b, v31.16b // AES block 4k+5 - round N-1
- add x0, x0, #64 // AES input_ptr update
- mov x6, v0.d[0] // AES block 4k+4 - mov low
- fmov d0, x10 // CTR block 4k+8
- fmov v0.d[1], x9 // CTR block 4k+8
- pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
- eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result
- rev w9, w12 // CTR block 4k+9
- aese v2.16b, v31.16b // AES block 4k+6 - round N-1
- orr x9, x11, x9, lsl #32 // CTR block 4k+9
- cmp x0, x5 // LOOP CONTROL
- add w12, w12, #1 // CTR block 4k+9
- eor x6, x6, x13 // AES block 4k+4 - round N low
- eor x7, x7, x14 // AES block 4k+4 - round N high
- mov x20, v1.d[1] // AES block 4k+5 - mov high
- eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result
- eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
- mov x19, v1.d[0] // AES block 4k+5 - mov low
- fmov d1, x10 // CTR block 4k+9
- ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
- fmov v1.d[1], x9 // CTR block 4k+9
- rev w9, w12 // CTR block 4k+10
- add w12, w12, #1 // CTR block 4k+10
- aese v3.16b, v31.16b // AES block 4k+7 - round N-1
- orr x9, x11, x9, lsl #32 // CTR block 4k+10
- rev64 v5.16b, v5.16b // GHASH block 4k+5
- eor x20, x20, x14 // AES block 4k+5 - round N high
- stp x6, x7, [x2], #16 // AES block 4k+4 - store result
- eor x19, x19, x13 // AES block 4k+5 - round N low
- stp x19, x20, [x2], #16 // AES block 4k+5 - store result
- rev64 v4.16b, v4.16b // GHASH block 4k+4
- eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
- b.lt Ldec_main_loop
-
-Ldec_prepretail: // PREPRETAIL
- ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
- mov x21, v2.d[0] // AES block 4k+2 - mov low
- eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
- aese v0.16b, v18.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
- mov x22, v2.d[1] // AES block 4k+2 - mov high
- aese v1.16b, v18.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
- fmov d2, x10 // CTR block 4k+6
- fmov v2.d[1], x9 // CTR block 4k+6
- rev w9, w12 // CTR block 4k+7
- eor v4.16b, v4.16b, v11.16b // PRE 1
- rev64 v6.16b, v6.16b // GHASH block 4k+2
- orr x9, x11, x9, lsl #32 // CTR block 4k+7
- mov x23, v3.d[0] // AES block 4k+3 - mov low
- aese v1.16b, v19.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
- mov x24, v3.d[1] // AES block 4k+3 - mov high
- pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
- mov d8, v4.d[1] // GHASH block 4k - mid
- fmov d3, x10 // CTR block 4k+7
- pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
- fmov v3.d[1], x9 // CTR block 4k+7
- aese v2.16b, v18.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
- mov d10, v17.d[1] // GHASH block 4k - mid
- aese v0.16b, v19.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
- eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
- pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
- aese v2.16b, v19.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
- rev64 v7.16b, v7.16b // GHASH block 4k+3
- aese v3.16b, v18.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
- pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
- pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
- aese v3.16b, v19.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
- mov d4, v5.d[1] // GHASH block 4k+1 - mid
- aese v0.16b, v20.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
- aese v1.16b, v20.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
- eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
- aese v2.16b, v20.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
- aese v0.16b, v21.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
- mov d8, v6.d[1] // GHASH block 4k+2 - mid
- aese v3.16b, v20.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
- eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
- pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
- aese v0.16b, v22.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
- aese v3.16b, v21.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
- eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
- pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
- aese v0.16b, v23.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
- eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
- aese v3.16b, v22.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
- pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
- eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
- pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
- aese v3.16b, v23.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
- ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
- aese v2.16b, v21.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
- aese v1.16b, v21.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
- eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
- pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
- aese v2.16b, v22.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
- mov d6, v7.d[1] // GHASH block 4k+3 - mid
- aese v1.16b, v22.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
- pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
- aese v2.16b, v23.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
- eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
- aese v1.16b, v23.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
- aese v3.16b, v24.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
- eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
- aese v2.16b, v24.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
- aese v0.16b, v24.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
- movi v8.8b, #0xc2
- aese v1.16b, v24.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
- eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
- pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
- aese v3.16b, v25.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
- cmp x17, #12 // setup flags for AES-128/192/256 check
- eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
- aese v1.16b, v25.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
- aese v0.16b, v25.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
- eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
- aese v3.16b, v26.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
- aese v2.16b, v25.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
- eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
- aese v1.16b, v26.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
- aese v0.16b, v26.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
- shl d8, d8, #56 // mod_constant
- aese v2.16b, v26.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
- b.lt Ldec_finish_prepretail // branch if AES-128
-
- aese v1.16b, v27.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
- aese v2.16b, v27.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
- aese v3.16b, v27.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
- aese v0.16b, v27.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
- aese v2.16b, v28.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
- aese v3.16b, v28.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
- aese v0.16b, v28.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
- aese v1.16b, v28.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
- b.eq Ldec_finish_prepretail // branch if AES-192
-
- aese v2.16b, v29.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
- aese v0.16b, v29.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
- aese v1.16b, v29.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
- aese v2.16b, v30.16b
- aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
- aese v3.16b, v29.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
- aese v1.16b, v30.16b
- aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
- aese v0.16b, v30.16b
- aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
- aese v3.16b, v30.16b
- aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
-
-Ldec_finish_prepretail:
- eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
- pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
- ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
- eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
- eor x22, x22, x14 // AES block 4k+2 - round N high
- eor x23, x23, x13 // AES block 4k+3 - round N low
- eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
- add w12, w12, #1 // CTR block 4k+7
- eor x21, x21, x13 // AES block 4k+2 - round N low
- pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
- eor x24, x24, x14 // AES block 4k+3 - round N high
- stp x21, x22, [x2], #16 // AES block 4k+2 - store result
- ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
- stp x23, x24, [x2], #16 // AES block 4k+3 - store result
-
- eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
- aese v1.16b, v31.16b // AES block 4k+5 - round N-1
- aese v0.16b, v31.16b // AES block 4k+4 - round N-1
- aese v3.16b, v31.16b // AES block 4k+7 - round N-1
- aese v2.16b, v31.16b // AES block 4k+6 - round N-1
- eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
-
-Ldec_tail: // TAIL
- sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
- ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext
- eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result
- mov x6, v0.d[0] // AES block 4k+4 - mov low
- mov x7, v0.d[1] // AES block 4k+4 - mov high
- ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
- cmp x5, #48
- eor x6, x6, x13 // AES block 4k+4 - round N low
- eor x7, x7, x14 // AES block 4k+4 - round N high
- b.gt Ldec_blocks_more_than_3
- sub w12, w12, #1
- mov v3.16b, v2.16b
- movi v10.8b, #0
- movi v11.8b, #0
- cmp x5, #32
- movi v9.8b, #0
- mov v2.16b, v1.16b
- b.gt Ldec_blocks_more_than_2
- sub w12, w12, #1
- mov v3.16b, v1.16b
- cmp x5, #16
- b.gt Ldec_blocks_more_than_1
- sub w12, w12, #1
- b Ldec_blocks_less_than_1
-Ldec_blocks_more_than_3: // blocks left > 3
- rev64 v4.16b, v5.16b // GHASH final-3 block
- ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext
- stp x6, x7, [x2], #16 // AES final-3 block - store result
- mov d10, v17.d[1] // GHASH final-3 block - mid
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- eor v0.16b, v5.16b, v1.16b // AES final-2 block - result
- mov d22, v4.d[1] // GHASH final-3 block - mid
- mov x6, v0.d[0] // AES final-2 block - mov low
- mov x7, v0.d[1] // AES final-2 block - mov high
- eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
- movi v8.8b, #0 // suppress further partial tag feed in
- pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
- pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
- eor x6, x6, x13 // AES final-2 block - round N low
- pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
- eor x7, x7, x14 // AES final-2 block - round N high
-Ldec_blocks_more_than_2: // blocks left > 2
- rev64 v4.16b, v5.16b // GHASH final-2 block
- ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- stp x6, x7, [x2], #16 // AES final-2 block - store result
- eor v0.16b, v5.16b, v2.16b // AES final-1 block - result
- mov d22, v4.d[1] // GHASH final-2 block - mid
- pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
- pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
- eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
- mov x6, v0.d[0] // AES final-1 block - mov low
- mov x7, v0.d[1] // AES final-1 block - mov high
- eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
- movi v8.8b, #0 // suppress further partial tag feed in
- pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
- eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
- eor x6, x6, x13 // AES final-1 block - round N low
- eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
- eor x7, x7, x14 // AES final-1 block - round N high
-Ldec_blocks_more_than_1: // blocks left > 1
- stp x6, x7, [x2], #16 // AES final-1 block - store result
- rev64 v4.16b, v5.16b // GHASH final-1 block
- ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- movi v8.8b, #0 // suppress further partial tag feed in
- mov d22, v4.d[1] // GHASH final-1 block - mid
- eor v0.16b, v5.16b, v3.16b // AES final block - result
- pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
- eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
- pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
- mov x6, v0.d[0] // AES final block - mov low
- ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
- mov x7, v0.d[1] // AES final block - mov high
- pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
- eor x6, x6, x13 // AES final block - round N low
- eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
- eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
- eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
- eor x7, x7, x14 // AES final block - round N high
-Ldec_blocks_less_than_1: // blocks left <= 1
- and x1, x1, #127 // bit_length %= 128
- mvn x14, xzr // rkN_h = 0xffffffffffffffff
- sub x1, x1, #128 // bit_length -= 128
- mvn x13, xzr // rkN_l = 0xffffffffffffffff
- ldp x4, x5, [x2] // load existing bytes we need to not overwrite
- neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
- and x1, x1, #127 // bit_length %= 128
- lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
- cmp x1, #64
- csel x9, x13, x14, lt
- csel x10, x14, xzr, lt
- fmov d0, x9 // ctr0b is mask for last block
- and x6, x6, x9
- mov v0.d[1], x10
- bic x4, x4, x9 // mask out low existing bytes
- rev w9, w12
- bic x5, x5, x10 // mask out high existing bytes
- orr x6, x6, x4
- and x7, x7, x10
- orr x7, x7, x5
- and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
- rev64 v4.16b, v5.16b // GHASH final block
- eor v4.16b, v4.16b, v8.16b // feed in partial tag
- pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
- mov d8, v4.d[1] // GHASH final block - mid
- eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
- pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
- pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
- eor v9.16b, v9.16b, v20.16b // GHASH final block - high
- eor v11.16b, v11.16b, v21.16b // GHASH final block - low
- eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
- movi v8.8b, #0xc2
- eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
- shl d8, d8, #56 // mod_constant
- eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
- pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
- ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
- eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
- eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
- pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
- ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
- eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
- stp x6, x7, [x2]
- str w9, [x16, #12] // store the updated counter
- eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
- ext v11.16b, v11.16b, v11.16b, #8
- rev64 v11.16b, v11.16b
- mov x0, x15
- st1 { v11.16b }, [x3]
- ldp x19, x20, [sp, #16]
- ldp x21, x22, [sp, #32]
- ldp x23, x24, [sp, #48]
- ldp d8, d9, [sp, #64]
- ldp d10, d11, [sp, #80]
- ldp d12, d13, [sp, #96]
- ldp d14, d15, [sp, #112]
- ldp x29, x30, [sp], #128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S b/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S
deleted file mode 100644
index cf798a3..0000000
--- a/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S
+++ /dev/null
@@ -1,1425 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl _bn_mul_mont
-.private_extern _bn_mul_mont
-
-.align 5
-_bn_mul_mont:
- AARCH64_SIGN_LINK_REGISTER
- tst x5,#7
- b.eq __bn_sqr8x_mont
- tst x5,#3
- b.eq __bn_mul4x_mont
-Lmul_mont:
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
-
- ldr x9,[x2],#8 // bp[0]
- sub x22,sp,x5,lsl#3
- ldp x7,x8,[x1],#16 // ap[0..1]
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- and x22,x22,#-16 // ABI says so
- ldp x13,x14,[x3],#16 // np[0..1]
-
- mul x6,x7,x9 // ap[0]*bp[0]
- sub x21,x5,#16 // j=num-2
- umulh x7,x7,x9
- mul x10,x8,x9 // ap[1]*bp[0]
- umulh x11,x8,x9
-
- mul x15,x6,x4 // "tp[0]"*n0
- mov sp,x22 // alloca
-
- // (*) mul x12,x13,x15 // np[0]*m1
- umulh x13,x13,x15
- mul x16,x14,x15 // np[1]*m1
- // (*) adds x12,x12,x6 // discarded
- // (*) As for removal of first multiplication and addition
- // instructions. The outcome of first addition is
- // guaranteed to be zero, which leaves two computationally
- // significant outcomes: it either carries or not. Then
- // question is when does it carry? Is there alternative
- // way to deduce it? If you follow operations, you can
- // observe that condition for carry is quite simple:
- // x6 being non-zero. So that carry can be calculated
- // by adding -1 to x6. That's what next instruction does.
- subs xzr,x6,#1 // (*)
- umulh x17,x14,x15
- adc x13,x13,xzr
- cbz x21,L1st_skip
-
-L1st:
- ldr x8,[x1],#8
- adds x6,x10,x7
- sub x21,x21,#8 // j--
- adc x7,x11,xzr
-
- ldr x14,[x3],#8
- adds x12,x16,x13
- mul x10,x8,x9 // ap[j]*bp[0]
- adc x13,x17,xzr
- umulh x11,x8,x9
-
- adds x12,x12,x6
- mul x16,x14,x15 // np[j]*m1
- adc x13,x13,xzr
- umulh x17,x14,x15
- str x12,[x22],#8 // tp[j-1]
- cbnz x21,L1st
-
-L1st_skip:
- adds x6,x10,x7
- sub x1,x1,x5 // rewind x1
- adc x7,x11,xzr
-
- adds x12,x16,x13
- sub x3,x3,x5 // rewind x3
- adc x13,x17,xzr
-
- adds x12,x12,x6
- sub x20,x5,#8 // i=num-1
- adcs x13,x13,x7
-
- adc x19,xzr,xzr // upmost overflow bit
- stp x12,x13,[x22]
-
-Louter:
- ldr x9,[x2],#8 // bp[i]
- ldp x7,x8,[x1],#16
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
-
- mul x6,x7,x9 // ap[0]*bp[i]
- sub x21,x5,#16 // j=num-2
- umulh x7,x7,x9
- ldp x13,x14,[x3],#16
- mul x10,x8,x9 // ap[1]*bp[i]
- adds x6,x6,x23
- umulh x11,x8,x9
- adc x7,x7,xzr
-
- mul x15,x6,x4
- sub x20,x20,#8 // i--
-
- // (*) mul x12,x13,x15 // np[0]*m1
- umulh x13,x13,x15
- mul x16,x14,x15 // np[1]*m1
- // (*) adds x12,x12,x6
- subs xzr,x6,#1 // (*)
- umulh x17,x14,x15
- cbz x21,Linner_skip
-
-Linner:
- ldr x8,[x1],#8
- adc x13,x13,xzr
- ldr x23,[x22],#8 // tp[j]
- adds x6,x10,x7
- sub x21,x21,#8 // j--
- adc x7,x11,xzr
-
- adds x12,x16,x13
- ldr x14,[x3],#8
- adc x13,x17,xzr
-
- mul x10,x8,x9 // ap[j]*bp[i]
- adds x6,x6,x23
- umulh x11,x8,x9
- adc x7,x7,xzr
-
- mul x16,x14,x15 // np[j]*m1
- adds x12,x12,x6
- umulh x17,x14,x15
- str x12,[x22,#-16] // tp[j-1]
- cbnz x21,Linner
-
-Linner_skip:
- ldr x23,[x22],#8 // tp[j]
- adc x13,x13,xzr
- adds x6,x10,x7
- sub x1,x1,x5 // rewind x1
- adc x7,x11,xzr
-
- adds x12,x16,x13
- sub x3,x3,x5 // rewind x3
- adcs x13,x17,x19
- adc x19,xzr,xzr
-
- adds x6,x6,x23
- adc x7,x7,xzr
-
- adds x12,x12,x6
- adcs x13,x13,x7
- adc x19,x19,xzr // upmost overflow bit
- stp x12,x13,[x22,#-16]
-
- cbnz x20,Louter
-
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
- ldr x14,[x3],#8 // np[0]
- subs x21,x5,#8 // j=num-1 and clear borrow
- mov x1,x0
-Lsub:
- sbcs x8,x23,x14 // tp[j]-np[j]
- ldr x23,[x22],#8
- sub x21,x21,#8 // j--
- ldr x14,[x3],#8
- str x8,[x1],#8 // rp[j]=tp[j]-np[j]
- cbnz x21,Lsub
-
- sbcs x8,x23,x14
- sbcs x19,x19,xzr // did it borrow?
- str x8,[x1],#8 // rp[num-1]
-
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
- ldr x8,[x0],#8 // rp[0]
- sub x5,x5,#8 // num--
- nop
-Lcond_copy:
- sub x5,x5,#8 // num--
- csel x14,x23,x8,lo // did it borrow?
- ldr x23,[x22],#8
- ldr x8,[x0],#8
- str xzr,[x22,#-16] // wipe tp
- str x14,[x0,#-16]
- cbnz x5,Lcond_copy
-
- csel x14,x23,x8,lo
- str xzr,[x22,#-8] // wipe tp
- str x14,[x0,#-8]
-
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldr x29,[sp],#64
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.align 5
-__bn_sqr8x_mont:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
- // only from bn_mul_mont which has already signed the return address.
- cmp x1,x2
- b.ne __bn_mul4x_mont
-Lsqr8x_mont:
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- stp x0,x3,[sp,#96] // offload rp and np
-
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- ldp x10,x11,[x1,#8*4]
- ldp x12,x13,[x1,#8*6]
-
- sub x2,sp,x5,lsl#4
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- mov sp,x2 // alloca
- sub x27,x5,#8*8
- b Lsqr8x_zero_start
-
-Lsqr8x_zero:
- sub x27,x27,#8*8
- stp xzr,xzr,[x2,#8*0]
- stp xzr,xzr,[x2,#8*2]
- stp xzr,xzr,[x2,#8*4]
- stp xzr,xzr,[x2,#8*6]
-Lsqr8x_zero_start:
- stp xzr,xzr,[x2,#8*8]
- stp xzr,xzr,[x2,#8*10]
- stp xzr,xzr,[x2,#8*12]
- stp xzr,xzr,[x2,#8*14]
- add x2,x2,#8*16
- cbnz x27,Lsqr8x_zero
-
- add x3,x1,x5
- add x1,x1,#8*8
- mov x19,xzr
- mov x20,xzr
- mov x21,xzr
- mov x22,xzr
- mov x23,xzr
- mov x24,xzr
- mov x25,xzr
- mov x26,xzr
- mov x2,sp
- str x4,[x29,#112] // offload n0
-
- // Multiply everything but a[i]*a[i]
-.align 4
-Lsqr8x_outer_loop:
- // a[1]a[0] (i)
- // a[2]a[0]
- // a[3]a[0]
- // a[4]a[0]
- // a[5]a[0]
- // a[6]a[0]
- // a[7]a[0]
- // a[2]a[1] (ii)
- // a[3]a[1]
- // a[4]a[1]
- // a[5]a[1]
- // a[6]a[1]
- // a[7]a[1]
- // a[3]a[2] (iii)
- // a[4]a[2]
- // a[5]a[2]
- // a[6]a[2]
- // a[7]a[2]
- // a[4]a[3] (iv)
- // a[5]a[3]
- // a[6]a[3]
- // a[7]a[3]
- // a[5]a[4] (v)
- // a[6]a[4]
- // a[7]a[4]
- // a[6]a[5] (vi)
- // a[7]a[5]
- // a[7]a[6] (vii)
-
- mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
- mul x15,x8,x6
- mul x16,x9,x6
- mul x17,x10,x6
- adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
- mul x14,x11,x6
- adcs x21,x21,x15
- mul x15,x12,x6
- adcs x22,x22,x16
- mul x16,x13,x6
- adcs x23,x23,x17
- umulh x17,x7,x6 // hi(a[1..7]*a[0])
- adcs x24,x24,x14
- umulh x14,x8,x6
- adcs x25,x25,x15
- umulh x15,x9,x6
- adcs x26,x26,x16
- umulh x16,x10,x6
- stp x19,x20,[x2],#8*2 // t[0..1]
- adc x19,xzr,xzr // t[8]
- adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
- umulh x17,x11,x6
- adcs x22,x22,x14
- umulh x14,x12,x6
- adcs x23,x23,x15
- umulh x15,x13,x6
- adcs x24,x24,x16
- mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
- adcs x25,x25,x17
- mul x17,x9,x7
- adcs x26,x26,x14
- mul x14,x10,x7
- adc x19,x19,x15
-
- mul x15,x11,x7
- adds x22,x22,x16
- mul x16,x12,x7
- adcs x23,x23,x17
- mul x17,x13,x7
- adcs x24,x24,x14
- umulh x14,x8,x7 // hi(a[2..7]*a[1])
- adcs x25,x25,x15
- umulh x15,x9,x7
- adcs x26,x26,x16
- umulh x16,x10,x7
- adcs x19,x19,x17
- umulh x17,x11,x7
- stp x21,x22,[x2],#8*2 // t[2..3]
- adc x20,xzr,xzr // t[9]
- adds x23,x23,x14
- umulh x14,x12,x7
- adcs x24,x24,x15
- umulh x15,x13,x7
- adcs x25,x25,x16
- mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
- adcs x26,x26,x17
- mul x17,x10,x8
- adcs x19,x19,x14
- mul x14,x11,x8
- adc x20,x20,x15
-
- mul x15,x12,x8
- adds x24,x24,x16
- mul x16,x13,x8
- adcs x25,x25,x17
- umulh x17,x9,x8 // hi(a[3..7]*a[2])
- adcs x26,x26,x14
- umulh x14,x10,x8
- adcs x19,x19,x15
- umulh x15,x11,x8
- adcs x20,x20,x16
- umulh x16,x12,x8
- stp x23,x24,[x2],#8*2 // t[4..5]
- adc x21,xzr,xzr // t[10]
- adds x25,x25,x17
- umulh x17,x13,x8
- adcs x26,x26,x14
- mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
- adcs x19,x19,x15
- mul x15,x11,x9
- adcs x20,x20,x16
- mul x16,x12,x9
- adc x21,x21,x17
-
- mul x17,x13,x9
- adds x26,x26,x14
- umulh x14,x10,x9 // hi(a[4..7]*a[3])
- adcs x19,x19,x15
- umulh x15,x11,x9
- adcs x20,x20,x16
- umulh x16,x12,x9
- adcs x21,x21,x17
- umulh x17,x13,x9
- stp x25,x26,[x2],#8*2 // t[6..7]
- adc x22,xzr,xzr // t[11]
- adds x19,x19,x14
- mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
- adcs x20,x20,x15
- mul x15,x12,x10
- adcs x21,x21,x16
- mul x16,x13,x10
- adc x22,x22,x17
-
- umulh x17,x11,x10 // hi(a[5..7]*a[4])
- adds x20,x20,x14
- umulh x14,x12,x10
- adcs x21,x21,x15
- umulh x15,x13,x10
- adcs x22,x22,x16
- mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
- adc x23,xzr,xzr // t[12]
- adds x21,x21,x17
- mul x17,x13,x11
- adcs x22,x22,x14
- umulh x14,x12,x11 // hi(a[6..7]*a[5])
- adc x23,x23,x15
-
- umulh x15,x13,x11
- adds x22,x22,x16
- mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
- adcs x23,x23,x17
- umulh x17,x13,x12 // hi(a[7]*a[6])
- adc x24,xzr,xzr // t[13]
- adds x23,x23,x14
- sub x27,x3,x1 // done yet?
- adc x24,x24,x15
-
- adds x24,x24,x16
- sub x14,x3,x5 // rewinded ap
- adc x25,xzr,xzr // t[14]
- add x25,x25,x17
-
- cbz x27,Lsqr8x_outer_break
-
- mov x4,x6
- ldp x6,x7,[x2,#8*0]
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- adds x19,x19,x6
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x0,x1
- adcs x26,xzr,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved below
- mov x27,#-8*8
-
- // a[8]a[0]
- // a[9]a[0]
- // a[a]a[0]
- // a[b]a[0]
- // a[c]a[0]
- // a[d]a[0]
- // a[e]a[0]
- // a[f]a[0]
- // a[8]a[1]
- // a[f]a[1]........................
- // a[8]a[2]
- // a[f]a[2]........................
- // a[8]a[3]
- // a[f]a[3]........................
- // a[8]a[4]
- // a[f]a[4]........................
- // a[8]a[5]
- // a[f]a[5]........................
- // a[8]a[6]
- // a[f]a[6]........................
- // a[8]a[7]
- // a[f]a[7]........................
-Lsqr8x_mul:
- mul x14,x6,x4
- adc x28,xzr,xzr // carry bit, modulo-scheduled
- mul x15,x7,x4
- add x27,x27,#8
- mul x16,x8,x4
- mul x17,x9,x4
- adds x19,x19,x14
- mul x14,x10,x4
- adcs x20,x20,x15
- mul x15,x11,x4
- adcs x21,x21,x16
- mul x16,x12,x4
- adcs x22,x22,x17
- mul x17,x13,x4
- adcs x23,x23,x14
- umulh x14,x6,x4
- adcs x24,x24,x15
- umulh x15,x7,x4
- adcs x25,x25,x16
- umulh x16,x8,x4
- adcs x26,x26,x17
- umulh x17,x9,x4
- adc x28,x28,xzr
- str x19,[x2],#8
- adds x19,x20,x14
- umulh x14,x10,x4
- adcs x20,x21,x15
- umulh x15,x11,x4
- adcs x21,x22,x16
- umulh x16,x12,x4
- adcs x22,x23,x17
- umulh x17,x13,x4
- ldr x4,[x0,x27]
- adcs x23,x24,x14
- adcs x24,x25,x15
- adcs x25,x26,x16
- adcs x26,x28,x17
- //adc x28,xzr,xzr // moved above
- cbnz x27,Lsqr8x_mul
- // note that carry flag is guaranteed
- // to be zero at this point
- cmp x1,x3 // done yet?
- b.eq Lsqr8x_break
-
- ldp x6,x7,[x2,#8*0]
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- adds x19,x19,x6
- ldr x4,[x0,#-8*8]
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x27,#-8*8
- adcs x26,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved above
- b Lsqr8x_mul
-
-.align 4
-Lsqr8x_break:
- ldp x6,x7,[x0,#8*0]
- add x1,x0,#8*8
- ldp x8,x9,[x0,#8*2]
- sub x14,x3,x1 // is it last iteration?
- ldp x10,x11,[x0,#8*4]
- sub x15,x2,x14
- ldp x12,x13,[x0,#8*6]
- cbz x14,Lsqr8x_outer_loop
-
- stp x19,x20,[x2,#8*0]
- ldp x19,x20,[x15,#8*0]
- stp x21,x22,[x2,#8*2]
- ldp x21,x22,[x15,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[x15,#8*4]
- stp x25,x26,[x2,#8*6]
- mov x2,x15
- ldp x25,x26,[x15,#8*6]
- b Lsqr8x_outer_loop
-
-.align 4
-Lsqr8x_outer_break:
- // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
- ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
- ldp x15,x16,[sp,#8*1]
- ldp x11,x13,[x14,#8*2]
- add x1,x14,#8*4
- ldp x17,x14,[sp,#8*3]
-
- stp x19,x20,[x2,#8*0]
- mul x19,x7,x7
- stp x21,x22,[x2,#8*2]
- umulh x7,x7,x7
- stp x23,x24,[x2,#8*4]
- mul x8,x9,x9
- stp x25,x26,[x2,#8*6]
- mov x2,sp
- umulh x9,x9,x9
- adds x20,x7,x15,lsl#1
- extr x15,x16,x15,#63
- sub x27,x5,#8*4
-
-Lsqr4x_shift_n_add:
- adcs x21,x8,x15
- extr x16,x17,x16,#63
- sub x27,x27,#8*4
- adcs x22,x9,x16
- ldp x15,x16,[x2,#8*5]
- mul x10,x11,x11
- ldp x7,x9,[x1],#8*2
- umulh x11,x11,x11
- mul x12,x13,x13
- umulh x13,x13,x13
- extr x17,x14,x17,#63
- stp x19,x20,[x2,#8*0]
- adcs x23,x10,x17
- extr x14,x15,x14,#63
- stp x21,x22,[x2,#8*2]
- adcs x24,x11,x14
- ldp x17,x14,[x2,#8*7]
- extr x15,x16,x15,#63
- adcs x25,x12,x15
- extr x16,x17,x16,#63
- adcs x26,x13,x16
- ldp x15,x16,[x2,#8*9]
- mul x6,x7,x7
- ldp x11,x13,[x1],#8*2
- umulh x7,x7,x7
- mul x8,x9,x9
- umulh x9,x9,x9
- stp x23,x24,[x2,#8*4]
- extr x17,x14,x17,#63
- stp x25,x26,[x2,#8*6]
- add x2,x2,#8*8
- adcs x19,x6,x17
- extr x14,x15,x14,#63
- adcs x20,x7,x14
- ldp x17,x14,[x2,#8*3]
- extr x15,x16,x15,#63
- cbnz x27,Lsqr4x_shift_n_add
- ldp x1,x4,[x29,#104] // pull np and n0
-
- adcs x21,x8,x15
- extr x16,x17,x16,#63
- adcs x22,x9,x16
- ldp x15,x16,[x2,#8*5]
- mul x10,x11,x11
- umulh x11,x11,x11
- stp x19,x20,[x2,#8*0]
- mul x12,x13,x13
- umulh x13,x13,x13
- stp x21,x22,[x2,#8*2]
- extr x17,x14,x17,#63
- adcs x23,x10,x17
- extr x14,x15,x14,#63
- ldp x19,x20,[sp,#8*0]
- adcs x24,x11,x14
- extr x15,x16,x15,#63
- ldp x6,x7,[x1,#8*0]
- adcs x25,x12,x15
- extr x16,xzr,x16,#63
- ldp x8,x9,[x1,#8*2]
- adc x26,x13,x16
- ldp x10,x11,[x1,#8*4]
-
- // Reduce by 512 bits per iteration
- mul x28,x4,x19 // t[0]*n0
- ldp x12,x13,[x1,#8*6]
- add x3,x1,x5
- ldp x21,x22,[sp,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[sp,#8*4]
- stp x25,x26,[x2,#8*6]
- ldp x25,x26,[sp,#8*6]
- add x1,x1,#8*8
- mov x30,xzr // initial top-most carry
- mov x2,sp
- mov x27,#8
-
-Lsqr8x_reduction:
- // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
- mul x15,x7,x28
- sub x27,x27,#1
- mul x16,x8,x28
- str x28,[x2],#8 // put aside t[0]*n0 for tail processing
- mul x17,x9,x28
- // (*) adds xzr,x19,x14
- subs xzr,x19,#1 // (*)
- mul x14,x10,x28
- adcs x19,x20,x15
- mul x15,x11,x28
- adcs x20,x21,x16
- mul x16,x12,x28
- adcs x21,x22,x17
- mul x17,x13,x28
- adcs x22,x23,x14
- umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
- adcs x23,x24,x15
- umulh x15,x7,x28
- adcs x24,x25,x16
- umulh x16,x8,x28
- adcs x25,x26,x17
- umulh x17,x9,x28
- adc x26,xzr,xzr
- adds x19,x19,x14
- umulh x14,x10,x28
- adcs x20,x20,x15
- umulh x15,x11,x28
- adcs x21,x21,x16
- umulh x16,x12,x28
- adcs x22,x22,x17
- umulh x17,x13,x28
- mul x28,x4,x19 // next t[0]*n0
- adcs x23,x23,x14
- adcs x24,x24,x15
- adcs x25,x25,x16
- adc x26,x26,x17
- cbnz x27,Lsqr8x_reduction
-
- ldp x14,x15,[x2,#8*0]
- ldp x16,x17,[x2,#8*2]
- mov x0,x2
- sub x27,x3,x1 // done yet?
- adds x19,x19,x14
- adcs x20,x20,x15
- ldp x14,x15,[x2,#8*4]
- adcs x21,x21,x16
- adcs x22,x22,x17
- ldp x16,x17,[x2,#8*6]
- adcs x23,x23,x14
- adcs x24,x24,x15
- adcs x25,x25,x16
- adcs x26,x26,x17
- //adc x28,xzr,xzr // moved below
- cbz x27,Lsqr8x8_post_condition
-
- ldr x4,[x2,#-8*8]
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- ldp x10,x11,[x1,#8*4]
- mov x27,#-8*8
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
-
-Lsqr8x_tail:
- mul x14,x6,x4
- adc x28,xzr,xzr // carry bit, modulo-scheduled
- mul x15,x7,x4
- add x27,x27,#8
- mul x16,x8,x4
- mul x17,x9,x4
- adds x19,x19,x14
- mul x14,x10,x4
- adcs x20,x20,x15
- mul x15,x11,x4
- adcs x21,x21,x16
- mul x16,x12,x4
- adcs x22,x22,x17
- mul x17,x13,x4
- adcs x23,x23,x14
- umulh x14,x6,x4
- adcs x24,x24,x15
- umulh x15,x7,x4
- adcs x25,x25,x16
- umulh x16,x8,x4
- adcs x26,x26,x17
- umulh x17,x9,x4
- adc x28,x28,xzr
- str x19,[x2],#8
- adds x19,x20,x14
- umulh x14,x10,x4
- adcs x20,x21,x15
- umulh x15,x11,x4
- adcs x21,x22,x16
- umulh x16,x12,x4
- adcs x22,x23,x17
- umulh x17,x13,x4
- ldr x4,[x0,x27]
- adcs x23,x24,x14
- adcs x24,x25,x15
- adcs x25,x26,x16
- adcs x26,x28,x17
- //adc x28,xzr,xzr // moved above
- cbnz x27,Lsqr8x_tail
- // note that carry flag is guaranteed
- // to be zero at this point
- ldp x6,x7,[x2,#8*0]
- sub x27,x3,x1 // done yet?
- sub x16,x3,x5 // rewinded np
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- cbz x27,Lsqr8x_tail_break
-
- ldr x4,[x0,#-8*8]
- adds x19,x19,x6
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x27,#-8*8
- adcs x26,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved above
- b Lsqr8x_tail
-
-.align 4
-Lsqr8x_tail_break:
- ldr x4,[x29,#112] // pull n0
- add x27,x2,#8*8 // end of current t[num] window
-
- subs xzr,x30,#1 // "move" top-most carry to carry bit
- adcs x14,x19,x6
- adcs x15,x20,x7
- ldp x19,x20,[x0,#8*0]
- adcs x21,x21,x8
- ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
- adcs x22,x22,x9
- ldp x8,x9,[x16,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x16,#8*4]
- adcs x25,x25,x12
- adcs x26,x26,x13
- ldp x12,x13,[x16,#8*6]
- add x1,x16,#8*8
- adc x30,xzr,xzr // top-most carry
- mul x28,x4,x19
- stp x14,x15,[x2,#8*0]
- stp x21,x22,[x2,#8*2]
- ldp x21,x22,[x0,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[x0,#8*4]
- cmp x27,x29 // did we hit the bottom?
- stp x25,x26,[x2,#8*6]
- mov x2,x0 // slide the window
- ldp x25,x26,[x0,#8*6]
- mov x27,#8
- b.ne Lsqr8x_reduction
-
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- ldr x0,[x29,#96] // pull rp
- add x2,x2,#8*8
- subs x14,x19,x6
- sbcs x15,x20,x7
- sub x27,x5,#8*8
- mov x3,x0 // x0 copy
-
-Lsqr8x_sub:
- sbcs x16,x21,x8
- ldp x6,x7,[x1,#8*0]
- sbcs x17,x22,x9
- stp x14,x15,[x0,#8*0]
- sbcs x14,x23,x10
- ldp x8,x9,[x1,#8*2]
- sbcs x15,x24,x11
- stp x16,x17,[x0,#8*2]
- sbcs x16,x25,x12
- ldp x10,x11,[x1,#8*4]
- sbcs x17,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- ldp x19,x20,[x2,#8*0]
- sub x27,x27,#8*8
- ldp x21,x22,[x2,#8*2]
- ldp x23,x24,[x2,#8*4]
- ldp x25,x26,[x2,#8*6]
- add x2,x2,#8*8
- stp x14,x15,[x0,#8*4]
- sbcs x14,x19,x6
- stp x16,x17,[x0,#8*6]
- add x0,x0,#8*8
- sbcs x15,x20,x7
- cbnz x27,Lsqr8x_sub
-
- sbcs x16,x21,x8
- mov x2,sp
- add x1,sp,x5
- ldp x6,x7,[x3,#8*0]
- sbcs x17,x22,x9
- stp x14,x15,[x0,#8*0]
- sbcs x14,x23,x10
- ldp x8,x9,[x3,#8*2]
- sbcs x15,x24,x11
- stp x16,x17,[x0,#8*2]
- sbcs x16,x25,x12
- ldp x19,x20,[x1,#8*0]
- sbcs x17,x26,x13
- ldp x21,x22,[x1,#8*2]
- sbcs xzr,x30,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- stp x14,x15,[x0,#8*4]
- stp x16,x17,[x0,#8*6]
-
- sub x27,x5,#8*4
-Lsqr4x_cond_copy:
- sub x27,x27,#8*4
- csel x14,x19,x6,lo
- stp xzr,xzr,[x2,#8*0]
- csel x15,x20,x7,lo
- ldp x6,x7,[x3,#8*4]
- ldp x19,x20,[x1,#8*4]
- csel x16,x21,x8,lo
- stp xzr,xzr,[x2,#8*2]
- add x2,x2,#8*4
- csel x17,x22,x9,lo
- ldp x8,x9,[x3,#8*6]
- ldp x21,x22,[x1,#8*6]
- add x1,x1,#8*4
- stp x14,x15,[x3,#8*0]
- stp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- stp xzr,xzr,[x1,#8*0]
- stp xzr,xzr,[x1,#8*2]
- cbnz x27,Lsqr4x_cond_copy
-
- csel x14,x19,x6,lo
- stp xzr,xzr,[x2,#8*0]
- csel x15,x20,x7,lo
- stp xzr,xzr,[x2,#8*2]
- csel x16,x21,x8,lo
- csel x17,x22,x9,lo
- stp x14,x15,[x3,#8*0]
- stp x16,x17,[x3,#8*2]
-
- b Lsqr8x_done
-
-.align 4
-Lsqr8x8_post_condition:
- adc x28,xzr,xzr
- ldr x30,[x29,#8] // pull return address
- // x19-7,x28 hold result, x6-7 hold modulus
- subs x6,x19,x6
- ldr x1,[x29,#96] // pull rp
- sbcs x7,x20,x7
- stp xzr,xzr,[sp,#8*0]
- sbcs x8,x21,x8
- stp xzr,xzr,[sp,#8*2]
- sbcs x9,x22,x9
- stp xzr,xzr,[sp,#8*4]
- sbcs x10,x23,x10
- stp xzr,xzr,[sp,#8*6]
- sbcs x11,x24,x11
- stp xzr,xzr,[sp,#8*8]
- sbcs x12,x25,x12
- stp xzr,xzr,[sp,#8*10]
- sbcs x13,x26,x13
- stp xzr,xzr,[sp,#8*12]
- sbcs x28,x28,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*14]
-
- // x6-7 hold result-modulus
- csel x6,x19,x6,lo
- csel x7,x20,x7,lo
- csel x8,x21,x8,lo
- csel x9,x22,x9,lo
- stp x6,x7,[x1,#8*0]
- csel x10,x23,x10,lo
- csel x11,x24,x11,lo
- stp x8,x9,[x1,#8*2]
- csel x12,x25,x12,lo
- csel x13,x26,x13,lo
- stp x10,x11,[x1,#8*4]
- stp x12,x13,[x1,#8*6]
-
-Lsqr8x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- // x30 is popped earlier
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.align 5
-__bn_mul4x_mont:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
- // only from bn_mul_mont or __bn_mul8x_mont which have already signed the
- // return address.
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
- sub x26,sp,x5,lsl#3
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- sub sp,x26,#8*4 // alloca
-
- add x10,x2,x5
- add x27,x1,x5
- stp x0,x10,[x29,#96] // offload rp and &b[num]
-
- ldr x24,[x2,#8*0] // b[0]
- ldp x6,x7,[x1,#8*0] // a[0..3]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- mov x19,xzr
- mov x20,xzr
- mov x21,xzr
- mov x22,xzr
- ldp x14,x15,[x3,#8*0] // n[0..3]
- ldp x16,x17,[x3,#8*2]
- adds x3,x3,#8*4 // clear carry bit
- mov x0,xzr
- mov x28,#0
- mov x26,sp
-
-Loop_mul4x_1st_reduction:
- mul x10,x6,x24 // lo(a[0..3]*b[0])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[0..3]*b[0])
- adcs x20,x20,x11
- mul x25,x19,x4 // t[0]*n0
- adcs x21,x21,x12
- umulh x11,x7,x24
- adcs x22,x22,x13
- umulh x12,x8,x24
- adc x23,xzr,xzr
- umulh x13,x9,x24
- ldr x24,[x2,x28] // next b[i] (or b[0])
- adds x20,x20,x10
- // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
- str x25,[x26],#8 // put aside t[0]*n0 for tail processing
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- // (*) adds xzr,x19,x10
- subs xzr,x19,#1 // (*)
- umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
- adcs x19,x20,x11
- umulh x11,x15,x25
- adcs x20,x21,x12
- umulh x12,x16,x25
- adcs x21,x22,x13
- umulh x13,x17,x25
- adcs x22,x23,x0
- adc x0,xzr,xzr
- adds x19,x19,x10
- sub x10,x27,x1
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- cbnz x28,Loop_mul4x_1st_reduction
-
- cbz x10,Lmul4x4_post_condition
-
- ldp x6,x7,[x1,#8*0] // a[4..7]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- ldr x25,[sp] // a[0]*n0
- ldp x14,x15,[x3,#8*0] // n[4..7]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
-
-Loop_mul4x_1st_tail:
- mul x10,x6,x24 // lo(a[4..7]*b[i])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[4..7]*b[i])
- adcs x20,x20,x11
- umulh x11,x7,x24
- adcs x21,x21,x12
- umulh x12,x8,x24
- adcs x22,x22,x13
- umulh x13,x9,x24
- adc x23,xzr,xzr
- ldr x24,[x2,x28] // next b[i] (or b[0])
- adds x20,x20,x10
- mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- adds x19,x19,x10
- umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
- adcs x20,x20,x11
- umulh x11,x15,x25
- adcs x21,x21,x12
- umulh x12,x16,x25
- adcs x22,x22,x13
- adcs x23,x23,x0
- umulh x13,x17,x25
- adc x0,xzr,xzr
- ldr x25,[sp,x28] // next t[0]*n0
- str x19,[x26],#8 // result!!!
- adds x19,x20,x10
- sub x10,x27,x1 // done yet?
- adcs x20,x21,x11
- adcs x21,x22,x12
- adcs x22,x23,x13
- //adc x0,x0,xzr
- cbnz x28,Loop_mul4x_1st_tail
-
- sub x11,x27,x5 // rewinded x1
- cbz x10,Lmul4x_proceed
-
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- ldp x14,x15,[x3,#8*0]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- b Loop_mul4x_1st_tail
-
-.align 5
-Lmul4x_proceed:
- ldr x24,[x2,#8*4]! // *++b
- adc x30,x0,xzr
- ldp x6,x7,[x11,#8*0] // a[0..3]
- sub x3,x3,x5 // rewind np
- ldp x8,x9,[x11,#8*2]
- add x1,x11,#8*4
-
- stp x19,x20,[x26,#8*0] // result!!!
- ldp x19,x20,[sp,#8*4] // t[0..3]
- stp x21,x22,[x26,#8*2] // result!!!
- ldp x21,x22,[sp,#8*6]
-
- ldp x14,x15,[x3,#8*0] // n[0..3]
- mov x26,sp
- ldp x16,x17,[x3,#8*2]
- adds x3,x3,#8*4 // clear carry bit
- mov x0,xzr
-
-.align 4
-Loop_mul4x_reduction:
- mul x10,x6,x24 // lo(a[0..3]*b[4])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[0..3]*b[4])
- adcs x20,x20,x11
- mul x25,x19,x4 // t[0]*n0
- adcs x21,x21,x12
- umulh x11,x7,x24
- adcs x22,x22,x13
- umulh x12,x8,x24
- adc x23,xzr,xzr
- umulh x13,x9,x24
- ldr x24,[x2,x28] // next b[i]
- adds x20,x20,x10
- // (*) mul x10,x14,x25
- str x25,[x26],#8 // put aside t[0]*n0 for tail processing
- adcs x21,x21,x11
- mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- // (*) adds xzr,x19,x10
- subs xzr,x19,#1 // (*)
- umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
- adcs x19,x20,x11
- umulh x11,x15,x25
- adcs x20,x21,x12
- umulh x12,x16,x25
- adcs x21,x22,x13
- umulh x13,x17,x25
- adcs x22,x23,x0
- adc x0,xzr,xzr
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- cbnz x28,Loop_mul4x_reduction
-
- adc x0,x0,xzr
- ldp x10,x11,[x26,#8*4] // t[4..7]
- ldp x12,x13,[x26,#8*6]
- ldp x6,x7,[x1,#8*0] // a[4..7]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
-
- ldr x25,[sp] // t[0]*n0
- ldp x14,x15,[x3,#8*0] // n[4..7]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
-
-.align 4
-Loop_mul4x_tail:
- mul x10,x6,x24 // lo(a[4..7]*b[4])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[4..7]*b[4])
- adcs x20,x20,x11
- umulh x11,x7,x24
- adcs x21,x21,x12
- umulh x12,x8,x24
- adcs x22,x22,x13
- umulh x13,x9,x24
- adc x23,xzr,xzr
- ldr x24,[x2,x28] // next b[i]
- adds x20,x20,x10
- mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- adds x19,x19,x10
- umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
- adcs x20,x20,x11
- umulh x11,x15,x25
- adcs x21,x21,x12
- umulh x12,x16,x25
- adcs x22,x22,x13
- umulh x13,x17,x25
- adcs x23,x23,x0
- ldr x25,[sp,x28] // next a[0]*n0
- adc x0,xzr,xzr
- str x19,[x26],#8 // result!!!
- adds x19,x20,x10
- sub x10,x27,x1 // done yet?
- adcs x20,x21,x11
- adcs x21,x22,x12
- adcs x22,x23,x13
- //adc x0,x0,xzr
- cbnz x28,Loop_mul4x_tail
-
- sub x11,x3,x5 // rewinded np?
- adc x0,x0,xzr
- cbz x10,Loop_mul4x_break
-
- ldp x10,x11,[x26,#8*4]
- ldp x12,x13,[x26,#8*6]
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- ldp x14,x15,[x3,#8*0]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- b Loop_mul4x_tail
-
-.align 4
-Loop_mul4x_break:
- ldp x12,x13,[x29,#96] // pull rp and &b[num]
- adds x19,x19,x30
- add x2,x2,#8*4 // bp++
- adcs x20,x20,xzr
- sub x1,x1,x5 // rewind ap
- adcs x21,x21,xzr
- stp x19,x20,[x26,#8*0] // result!!!
- adcs x22,x22,xzr
- ldp x19,x20,[sp,#8*4] // t[0..3]
- adc x30,x0,xzr
- stp x21,x22,[x26,#8*2] // result!!!
- cmp x2,x13 // done yet?
- ldp x21,x22,[sp,#8*6]
- ldp x14,x15,[x11,#8*0] // n[0..3]
- ldp x16,x17,[x11,#8*2]
- add x3,x11,#8*4
- b.eq Lmul4x_post
-
- ldr x24,[x2]
- ldp x6,x7,[x1,#8*0] // a[0..3]
- ldp x8,x9,[x1,#8*2]
- adds x1,x1,#8*4 // clear carry bit
- mov x0,xzr
- mov x26,sp
- b Loop_mul4x_reduction
-
-.align 4
-Lmul4x_post:
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- mov x0,x12
- mov x27,x12 // x0 copy
- subs x10,x19,x14
- add x26,sp,#8*8
- sbcs x11,x20,x15
- sub x28,x5,#8*4
-
-Lmul4x_sub:
- sbcs x12,x21,x16
- ldp x14,x15,[x3,#8*0]
- sub x28,x28,#8*4
- ldp x19,x20,[x26,#8*0]
- sbcs x13,x22,x17
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- ldp x21,x22,[x26,#8*2]
- add x26,x26,#8*4
- stp x10,x11,[x0,#8*0]
- sbcs x10,x19,x14
- stp x12,x13,[x0,#8*2]
- add x0,x0,#8*4
- sbcs x11,x20,x15
- cbnz x28,Lmul4x_sub
-
- sbcs x12,x21,x16
- mov x26,sp
- add x1,sp,#8*4
- ldp x6,x7,[x27,#8*0]
- sbcs x13,x22,x17
- stp x10,x11,[x0,#8*0]
- ldp x8,x9,[x27,#8*2]
- stp x12,x13,[x0,#8*2]
- ldp x19,x20,[x1,#8*0]
- ldp x21,x22,[x1,#8*2]
- sbcs xzr,x30,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
-
- sub x28,x5,#8*4
-Lmul4x_cond_copy:
- sub x28,x28,#8*4
- csel x10,x19,x6,lo
- stp xzr,xzr,[x26,#8*0]
- csel x11,x20,x7,lo
- ldp x6,x7,[x27,#8*4]
- ldp x19,x20,[x1,#8*4]
- csel x12,x21,x8,lo
- stp xzr,xzr,[x26,#8*2]
- add x26,x26,#8*4
- csel x13,x22,x9,lo
- ldp x8,x9,[x27,#8*6]
- ldp x21,x22,[x1,#8*6]
- add x1,x1,#8*4
- stp x10,x11,[x27,#8*0]
- stp x12,x13,[x27,#8*2]
- add x27,x27,#8*4
- cbnz x28,Lmul4x_cond_copy
-
- csel x10,x19,x6,lo
- stp xzr,xzr,[x26,#8*0]
- csel x11,x20,x7,lo
- stp xzr,xzr,[x26,#8*2]
- csel x12,x21,x8,lo
- stp xzr,xzr,[x26,#8*3]
- csel x13,x22,x9,lo
- stp xzr,xzr,[x26,#8*4]
- stp x10,x11,[x27,#8*0]
- stp x12,x13,[x27,#8*2]
-
- b Lmul4x_done
-
-.align 4
-Lmul4x4_post_condition:
- adc x0,x0,xzr
- ldr x1,[x29,#96] // pull rp
- // x19-x22,x0 hold result, x14-x17 hold modulus
- subs x6,x19,x14
- ldr x30,[x29,#8] // pull return address
- sbcs x7,x20,x15
- stp xzr,xzr,[sp,#8*0]
- sbcs x8,x21,x16
- stp xzr,xzr,[sp,#8*2]
- sbcs x9,x22,x17
- stp xzr,xzr,[sp,#8*4]
- sbcs xzr,x0,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*6]
-
- // x6-x9 hold result-modulus
- csel x6,x19,x6,lo
- csel x7,x20,x7,lo
- csel x8,x21,x8,lo
- csel x9,x22,x9,lo
- stp x6,x7,[x1,#8*0]
- stp x8,x9,[x1,#8*2]
-
-Lmul4x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- // x30 is popped earlier
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 4
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
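
For orientation, the file removed above is the Apple/AArch64 Montgomery multiplication code (the mul4x loops plus the Lmul4x_post conditional subtraction). Below is a minimal, portable CIOS-style sketch of the operation that code computes for 4-limb operands. It is illustrative only: not BoringSSL code, not constant-time the way the assembly is, and it assumes a compiler with unsigned __int128 (GCC/Clang).

    #include <stdint.h>

    /* Sketch only: r = a*b*2^-256 mod n for 4-limb operands, with
     * n0 = -n^-1 mod 2^64, mirroring what the mul4x code above computes. */
    static void mont_mul4(uint64_t r[4], const uint64_t a[4], const uint64_t b[4],
                          const uint64_t n[4], uint64_t n0) {
      uint64_t t[6] = {0};
      for (int i = 0; i < 4; i++) {
        /* t += a * b[i] (the "lo/hi(a[0..3]*b[i])" halves in the assembly). */
        unsigned __int128 c = 0;
        for (int j = 0; j < 4; j++) {
          c += (unsigned __int128)a[j] * b[i] + t[j];
          t[j] = (uint64_t)c;
          c >>= 64;
        }
        c += t[4];
        t[4] = (uint64_t)c;
        t[5] = (uint64_t)(c >> 64);

        /* t += m*n with m = t[0]*n0, which zeroes the low limb; then drop it. */
        uint64_t m = t[0] * n0;
        c = ((unsigned __int128)m * n[0] + t[0]) >> 64;
        for (int j = 1; j < 4; j++) {
          c += (unsigned __int128)m * n[j] + t[j];
          t[j - 1] = (uint64_t)c;
          c >>= 64;
        }
        c += t[4];
        t[3] = (uint64_t)c;
        t[4] = t[5] + (uint64_t)(c >> 64);
      }

      /* Final step, as the Lmul4x_post comment describes: subtract the modulus,
       * and keep the unsubtracted value if that subtraction borrowed. */
      uint64_t d[4], borrow = 0;
      for (int i = 0; i < 4; i++) {
        unsigned __int128 diff = (unsigned __int128)t[i] - n[i] - borrow;
        d[i] = (uint64_t)diff;
        borrow = (diff >> 64) != 0;  /* wrapped => borrow out of this limb */
      }
      int keep_t = t[4] < borrow;    /* borrow out of the top word => t < n */
      for (int i = 0; i < 4; i++) {
        r[i] = keep_t ? t[i] : d[i];
      }
    }
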
diff --git a/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S
deleted file mode 100644
index 5e3471a..0000000
--- a/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S
+++ /dev/null
@@ -1,89 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-// size_t num);
-
-.globl _bn_add_words
-.private_extern _bn_add_words
-.align 4
-_bn_add_words:
- AARCH64_VALID_CALL_TARGET
- # Clear the carry flag.
- cmn xzr, xzr
-
- # aarch64 can load two registers at a time, so we do two loop iterations
- # at a time. Split x3 = 2 * x8 + x3. This allows loop
- # operations to use CBNZ without clobbering the carry flag.
- lsr x8, x3, #1
- and x3, x3, #1
-
- cbz x8, Ladd_tail
-Ladd_loop:
- ldp x4, x5, [x1], #16
- ldp x6, x7, [x2], #16
- sub x8, x8, #1
- adcs x4, x4, x6
- adcs x5, x5, x7
- stp x4, x5, [x0], #16
- cbnz x8, Ladd_loop
-
-Ladd_tail:
- cbz x3, Ladd_exit
- ldr x4, [x1], #8
- ldr x6, [x2], #8
- adcs x4, x4, x6
- str x4, [x0], #8
-
-Ladd_exit:
- cset x0, cs
- ret
-
-
-// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-// size_t num);
-
-.globl _bn_sub_words
-.private_extern _bn_sub_words
-.align 4
-_bn_sub_words:
- AARCH64_VALID_CALL_TARGET
- # Set the carry flag. Arm's borrow bit is flipped from the carry flag,
- # so we want C = 1 here.
- cmp xzr, xzr
-
- # aarch64 can load two registers at a time, so we do two loop iterations
- # at a time. Split x3 = 2 * x8 + x3. This allows loop
- # operations to use CBNZ without clobbering the carry flag.
- lsr x8, x3, #1
- and x3, x3, #1
-
- cbz x8, Lsub_tail
-Lsub_loop:
- ldp x4, x5, [x1], #16
- ldp x6, x7, [x2], #16
- sub x8, x8, #1
- sbcs x4, x4, x6
- sbcs x5, x5, x7
- stp x4, x5, [x0], #16
- cbnz x8, Lsub_loop
-
-Lsub_tail:
- cbz x3, Lsub_exit
- ldr x4, [x1], #8
- ldr x6, [x2], #8
- sbcs x4, x4, x6
- str x4, [x0], #8
-
-Lsub_exit:
- cset x0, cc
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
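
For reference, here is a portable sketch of what the bn_add_words routine deleted above computes (limb-wise addition with a returned carry). The prototype comes from the comment in the file; the body is illustrative C, not the BoringSSL implementation, and it does not reproduce the carry-flag/CBNZ scheduling trick the assembly comments describe. bn_sub_words has the same shape with limb-wise subtraction and a borrow in place of the carry.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t BN_ULONG;  /* 64-bit limbs on AArch64 */

    /* rp = ap + bp over num limbs; returns the carry out of the top limb,
     * the value the assembly materializes with "cset x0, cs". */
    static BN_ULONG bn_add_words_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                                        const BN_ULONG *bp, size_t num) {
      BN_ULONG carry = 0;
      for (size_t i = 0; i < num; i++) {
        BN_ULONG a = ap[i];
        BN_ULONG s = a + bp[i];
        BN_ULONG c1 = s < a;       /* overflow of a + b */
        rp[i] = s + carry;
        carry = c1 | (rp[i] < s);  /* plus overflow of adding the carry-in */
      }
      return carry;
    }
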
diff --git a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S
deleted file mode 100644
index a76b8d1..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S
+++ /dev/null
@@ -1,335 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl _gcm_init_neon
-.private_extern _gcm_init_neon
-
-.align 4
-_gcm_init_neon:
- AARCH64_VALID_CALL_TARGET
- // This function is adapted from gcm_init_v8. xC2 is t3.
- ld1 {v17.2d}, [x1] // load H
- movi v19.16b, #0xe1
- shl v19.2d, v19.2d, #57 // 0xc2.0
- ext v3.16b, v17.16b, v17.16b, #8
- ushr v18.2d, v19.2d, #63
- dup v17.4s, v17.s[1]
- ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
- ushr v18.2d, v3.2d, #63
- sshr v17.4s, v17.4s, #31 // broadcast carry bit
- and v18.16b, v18.16b, v16.16b
- shl v3.2d, v3.2d, #1
- ext v18.16b, v18.16b, v18.16b, #8
- and v16.16b, v16.16b, v17.16b
- orr v3.16b, v3.16b, v18.16b // H<<<=1
- eor v5.16b, v3.16b, v16.16b // twisted H
- st1 {v5.2d}, [x0] // store Htable[0]
- ret
-
-
-.globl _gcm_gmult_neon
-.private_extern _gcm_gmult_neon
-
-.align 4
-_gcm_gmult_neon:
- AARCH64_VALID_CALL_TARGET
- ld1 {v3.16b}, [x0] // load Xi
- ld1 {v5.1d}, [x1], #8 // load twisted H
- ld1 {v6.1d}, [x1]
- adrp x9, Lmasks@PAGE // load constants
- add x9, x9, Lmasks@PAGEOFF
- ld1 {v24.2d, v25.2d}, [x9]
- rev64 v3.16b, v3.16b // byteswap Xi
- ext v3.16b, v3.16b, v3.16b, #8
- eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
-
- mov x3, #16
- b Lgmult_neon
-
-
-.globl _gcm_ghash_neon
-.private_extern _gcm_ghash_neon
-
-.align 4
-_gcm_ghash_neon:
- AARCH64_VALID_CALL_TARGET
- ld1 {v0.16b}, [x0] // load Xi
- ld1 {v5.1d}, [x1], #8 // load twisted H
- ld1 {v6.1d}, [x1]
- adrp x9, Lmasks@PAGE // load constants
- add x9, x9, Lmasks@PAGEOFF
- ld1 {v24.2d, v25.2d}, [x9]
- rev64 v0.16b, v0.16b // byteswap Xi
- ext v0.16b, v0.16b, v0.16b, #8
- eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
-
-Loop_neon:
- ld1 {v3.16b}, [x2], #16 // load inp
- rev64 v3.16b, v3.16b // byteswap inp
- ext v3.16b, v3.16b, v3.16b, #8
- eor v3.16b, v3.16b, v0.16b // inp ^= Xi
-
-Lgmult_neon:
- // Split the input into v3 and v4. (The upper halves are unused,
- // so it is okay to leave them alone.)
- ins v4.d[0], v3.d[1]
- ext v16.8b, v5.8b, v5.8b, #1 // A1
- pmull v16.8h, v16.8b, v3.8b // F = A1*B
- ext v0.8b, v3.8b, v3.8b, #1 // B1
- pmull v0.8h, v5.8b, v0.8b // E = A*B1
- ext v17.8b, v5.8b, v5.8b, #2 // A2
- pmull v17.8h, v17.8b, v3.8b // H = A2*B
- ext v19.8b, v3.8b, v3.8b, #2 // B2
- pmull v19.8h, v5.8b, v19.8b // G = A*B2
- ext v18.8b, v5.8b, v5.8b, #3 // A3
- eor v16.16b, v16.16b, v0.16b // L = E + F
- pmull v18.8h, v18.8b, v3.8b // J = A3*B
- ext v0.8b, v3.8b, v3.8b, #3 // B3
- eor v17.16b, v17.16b, v19.16b // M = G + H
- pmull v0.8h, v5.8b, v0.8b // I = A*B3
-
- // Here we diverge from the 32-bit version. It computes the following
- // (instructions reordered for clarity):
- //
- // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
- // vand $t0#hi, $t0#hi, $k48
- // veor $t0#lo, $t0#lo, $t0#hi
- //
- // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
- // vand $t1#hi, $t1#hi, $k32
- // veor $t1#lo, $t1#lo, $t1#hi
- //
- // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
- // vand $t2#hi, $t2#hi, $k16
- // veor $t2#lo, $t2#lo, $t2#hi
- //
- // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
- // vmov.i64 $t3#hi, #0
- //
- // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
- // upper halves of SIMD registers, so we must split each half into
- // separate registers. To compensate, we pair computations up and
- // parallelize.
-
- ext v19.8b, v3.8b, v3.8b, #4 // B4
- eor v18.16b, v18.16b, v0.16b // N = I + J
- pmull v19.8h, v5.8b, v19.8b // K = A*B4
-
- // This can probably be scheduled more efficiently. For now, we just
- // pair up independent instructions.
- zip1 v20.2d, v16.2d, v17.2d
- zip1 v22.2d, v18.2d, v19.2d
- zip2 v21.2d, v16.2d, v17.2d
- zip2 v23.2d, v18.2d, v19.2d
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- and v21.16b, v21.16b, v24.16b
- and v23.16b, v23.16b, v25.16b
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- zip1 v16.2d, v20.2d, v21.2d
- zip1 v18.2d, v22.2d, v23.2d
- zip2 v17.2d, v20.2d, v21.2d
- zip2 v19.2d, v22.2d, v23.2d
-
- ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
- ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
- pmull v0.8h, v5.8b, v3.8b // D = A*B
- ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
- ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
- eor v16.16b, v16.16b, v17.16b
- eor v18.16b, v18.16b, v19.16b
- eor v0.16b, v0.16b, v16.16b
- eor v0.16b, v0.16b, v18.16b
- eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
- ext v16.8b, v7.8b, v7.8b, #1 // A1
- pmull v16.8h, v16.8b, v3.8b // F = A1*B
- ext v1.8b, v3.8b, v3.8b, #1 // B1
- pmull v1.8h, v7.8b, v1.8b // E = A*B1
- ext v17.8b, v7.8b, v7.8b, #2 // A2
- pmull v17.8h, v17.8b, v3.8b // H = A2*B
- ext v19.8b, v3.8b, v3.8b, #2 // B2
- pmull v19.8h, v7.8b, v19.8b // G = A*B2
- ext v18.8b, v7.8b, v7.8b, #3 // A3
- eor v16.16b, v16.16b, v1.16b // L = E + F
- pmull v18.8h, v18.8b, v3.8b // J = A3*B
- ext v1.8b, v3.8b, v3.8b, #3 // B3
- eor v17.16b, v17.16b, v19.16b // M = G + H
- pmull v1.8h, v7.8b, v1.8b // I = A*B3
-
- // Here we diverge from the 32-bit version. It computes the following
- // (instructions reordered for clarity):
- //
- // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
- // vand $t0#hi, $t0#hi, $k48
- // veor $t0#lo, $t0#lo, $t0#hi
- //
- // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
- // vand $t1#hi, $t1#hi, $k32
- // veor $t1#lo, $t1#lo, $t1#hi
- //
- // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
- // vand $t2#hi, $t2#hi, $k16
- // veor $t2#lo, $t2#lo, $t2#hi
- //
- // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
- // vmov.i64 $t3#hi, #0
- //
- // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
- // upper halves of SIMD registers, so we must split each half into
- // separate registers. To compensate, we pair computations up and
- // parallelize.
-
- ext v19.8b, v3.8b, v3.8b, #4 // B4
- eor v18.16b, v18.16b, v1.16b // N = I + J
- pmull v19.8h, v7.8b, v19.8b // K = A*B4
-
- // This can probably be scheduled more efficiently. For now, we just
- // pair up independent instructions.
- zip1 v20.2d, v16.2d, v17.2d
- zip1 v22.2d, v18.2d, v19.2d
- zip2 v21.2d, v16.2d, v17.2d
- zip2 v23.2d, v18.2d, v19.2d
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- and v21.16b, v21.16b, v24.16b
- and v23.16b, v23.16b, v25.16b
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- zip1 v16.2d, v20.2d, v21.2d
- zip1 v18.2d, v22.2d, v23.2d
- zip2 v17.2d, v20.2d, v21.2d
- zip2 v19.2d, v22.2d, v23.2d
-
- ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
- ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
- pmull v1.8h, v7.8b, v3.8b // D = A*B
- ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
- ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
- eor v16.16b, v16.16b, v17.16b
- eor v18.16b, v18.16b, v19.16b
- eor v1.16b, v1.16b, v16.16b
- eor v1.16b, v1.16b, v18.16b
- ext v16.8b, v6.8b, v6.8b, #1 // A1
- pmull v16.8h, v16.8b, v4.8b // F = A1*B
- ext v2.8b, v4.8b, v4.8b, #1 // B1
- pmull v2.8h, v6.8b, v2.8b // E = A*B1
- ext v17.8b, v6.8b, v6.8b, #2 // A2
- pmull v17.8h, v17.8b, v4.8b // H = A2*B
- ext v19.8b, v4.8b, v4.8b, #2 // B2
- pmull v19.8h, v6.8b, v19.8b // G = A*B2
- ext v18.8b, v6.8b, v6.8b, #3 // A3
- eor v16.16b, v16.16b, v2.16b // L = E + F
- pmull v18.8h, v18.8b, v4.8b // J = A3*B
- ext v2.8b, v4.8b, v4.8b, #3 // B3
- eor v17.16b, v17.16b, v19.16b // M = G + H
- pmull v2.8h, v6.8b, v2.8b // I = A*B3
-
- // Here we diverge from the 32-bit version. It computes the following
- // (instructions reordered for clarity):
- //
- // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
- // vand $t0#hi, $t0#hi, $k48
- // veor $t0#lo, $t0#lo, $t0#hi
- //
- // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
- // vand $t1#hi, $t1#hi, $k32
- // veor $t1#lo, $t1#lo, $t1#hi
- //
- // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
- // vand $t2#hi, $t2#hi, $k16
- // veor $t2#lo, $t2#lo, $t2#hi
- //
- // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
- // vmov.i64 $t3#hi, #0
- //
- // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
- // upper halves of SIMD registers, so we must split each half into
- // separate registers. To compensate, we pair computations up and
- // parallelize.
-
- ext v19.8b, v4.8b, v4.8b, #4 // B4
- eor v18.16b, v18.16b, v2.16b // N = I + J
- pmull v19.8h, v6.8b, v19.8b // K = A*B4
-
- // This can probably be scheduled more efficiently. For now, we just
- // pair up independent instructions.
- zip1 v20.2d, v16.2d, v17.2d
- zip1 v22.2d, v18.2d, v19.2d
- zip2 v21.2d, v16.2d, v17.2d
- zip2 v23.2d, v18.2d, v19.2d
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- and v21.16b, v21.16b, v24.16b
- and v23.16b, v23.16b, v25.16b
- eor v20.16b, v20.16b, v21.16b
- eor v22.16b, v22.16b, v23.16b
- zip1 v16.2d, v20.2d, v21.2d
- zip1 v18.2d, v22.2d, v23.2d
- zip2 v17.2d, v20.2d, v21.2d
- zip2 v19.2d, v22.2d, v23.2d
-
- ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
- ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
- pmull v2.8h, v6.8b, v4.8b // D = A*B
- ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
- ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
- eor v16.16b, v16.16b, v17.16b
- eor v18.16b, v18.16b, v19.16b
- eor v2.16b, v2.16b, v16.16b
- eor v2.16b, v2.16b, v18.16b
- ext v16.16b, v0.16b, v2.16b, #8
- eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
- eor v1.16b, v1.16b, v2.16b
- eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
- ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
- // This is a no-op due to the ins instruction below.
- // ins v2.d[0], v1.d[1]
-
- // equivalent of reduction_avx from ghash-x86_64.pl
- shl v17.2d, v0.2d, #57 // 1st phase
- shl v18.2d, v0.2d, #62
- eor v18.16b, v18.16b, v17.16b //
- shl v17.2d, v0.2d, #63
- eor v18.16b, v18.16b, v17.16b //
- // Note Xm contains {Xl.d[1], Xh.d[0]}.
- eor v18.16b, v18.16b, v1.16b
- ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
- ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
-
- ushr v18.2d, v0.2d, #1 // 2nd phase
- eor v2.16b, v2.16b,v0.16b
- eor v0.16b, v0.16b,v18.16b //
- ushr v18.2d, v18.2d, #6
- ushr v0.2d, v0.2d, #1 //
- eor v0.16b, v0.16b, v2.16b //
- eor v0.16b, v0.16b, v18.16b //
-
- subs x3, x3, #16
- bne Loop_neon
-
- rev64 v0.16b, v0.16b // byteswap Xi and write
- ext v0.16b, v0.16b, v0.16b, #8
- st1 {v0.16b}, [x0]
-
- ret
-
-
-.section __TEXT,__const
-.align 4
-Lmasks:
-.quad 0x0000ffffffffffff // k48
-.quad 0x00000000ffffffff // k32
-.quad 0x000000000000ffff // k16
-.quad 0x0000000000000000 // k0
-.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
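
The repeated "Karatsuba pre-processing"/"post-processing" comments in the deleted GHASH code refer to splitting a 128-bit carry-less multiplication into three 64-bit ones. The sketch below shows only that structure in portable C; clmul64 is a slow reference stand-in for a single polynomial multiply (the assembly instead builds it out of byte-wide pmull and the k48/k32/k16 masks above), so this is illustrative, not the NEON algorithm itself.

    #include <stdint.h>

    /* Reference carry-less 64x64 -> 128-bit multiply (what one pmull provides). */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
      uint64_t h = 0, l = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          l ^= a << i;
          if (i != 0) {
            h ^= a >> (64 - i);
          }
        }
      }
      *hi = h;
      *lo = l;
    }

    /* (ah*x^64 + al) * (bh*x^64 + bl) from three half-width multiplies:
     * the middle term is (ah^al)*(bh^bl) ^ ah*bh ^ al*bl. Result r[0..3],
     * least-significant limb first. */
    static void clmul128(uint64_t r[4], uint64_t ah, uint64_t al,
                         uint64_t bh, uint64_t bl) {
      uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;
      clmul64(al, bl, &lo_h, &lo_l);              /* low halves            */
      clmul64(ah, bh, &hi_h, &hi_l);              /* high halves           */
      clmul64(ah ^ al, bh ^ bl, &mid_h, &mid_l);  /* "Karatsuba pre-processing" */
      mid_l ^= lo_l ^ hi_l;                       /* "Karatsuba post-processing" */
      mid_h ^= lo_h ^ hi_h;
      r[0] = lo_l;
      r[1] = lo_h ^ mid_l;
      r[2] = hi_l ^ mid_h;
      r[3] = hi_h;
    }
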
diff --git a/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S
deleted file mode 100644
index 6bc8a4f..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S
+++ /dev/null
@@ -1,565 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.globl _gcm_init_v8
-.private_extern _gcm_init_v8
-
-.align 4
-_gcm_init_v8:
- AARCH64_VALID_CALL_TARGET
- ld1 {v17.2d},[x1] //load input H
- movi v19.16b,#0xe1
- shl v19.2d,v19.2d,#57 //0xc2.0
- ext v3.16b,v17.16b,v17.16b,#8
- ushr v18.2d,v19.2d,#63
- dup v17.4s,v17.s[1]
- ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
- ushr v18.2d,v3.2d,#63
- sshr v17.4s,v17.4s,#31 //broadcast carry bit
- and v18.16b,v18.16b,v16.16b
- shl v3.2d,v3.2d,#1
- ext v18.16b,v18.16b,v18.16b,#8
- and v16.16b,v16.16b,v17.16b
- orr v3.16b,v3.16b,v18.16b //H<<<=1
- eor v20.16b,v3.16b,v16.16b //twisted H
- st1 {v20.2d},[x0],#16 //store Htable[0]
-
- //calculate H^2
- ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
- pmull v0.1q,v20.1d,v20.1d
- eor v16.16b,v16.16b,v20.16b
- pmull2 v2.1q,v20.2d,v20.2d
- pmull v1.1q,v16.1d,v16.1d
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase
-
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v22.16b,v0.16b,v18.16b
-
- ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
- eor v17.16b,v17.16b,v22.16b
- ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
- st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
- //calculate H^3 and H^4
- pmull v0.1q,v20.1d, v22.1d
- pmull v5.1q,v22.1d,v22.1d
- pmull2 v2.1q,v20.2d, v22.2d
- pmull2 v7.1q,v22.2d,v22.2d
- pmull v1.1q,v16.1d,v17.1d
- pmull v6.1q,v17.1d,v17.1d
-
- ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- ext v17.16b,v5.16b,v7.16b,#8
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v16.16b
- eor v4.16b,v5.16b,v7.16b
- eor v6.16b,v6.16b,v17.16b
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase
- eor v6.16b,v6.16b,v4.16b
- pmull v4.1q,v5.1d,v19.1d
-
- ins v2.d[0],v1.d[1]
- ins v7.d[0],v6.d[1]
- ins v1.d[1],v0.d[0]
- ins v6.d[1],v5.d[0]
- eor v0.16b,v1.16b,v18.16b
- eor v5.16b,v6.16b,v4.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
- ext v4.16b,v5.16b,v5.16b,#8
- pmull v0.1q,v0.1d,v19.1d
- pmull v5.1q,v5.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v4.16b,v4.16b,v7.16b
- eor v20.16b, v0.16b,v18.16b //H^3
- eor v22.16b,v5.16b,v4.16b //H^4
-
- ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
- ext v17.16b,v22.16b,v22.16b,#8
- eor v16.16b,v16.16b,v20.16b
- eor v17.16b,v17.16b,v22.16b
- ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
- st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
- ret
-
-.globl _gcm_gmult_v8
-.private_extern _gcm_gmult_v8
-
-.align 4
-_gcm_gmult_v8:
- AARCH64_VALID_CALL_TARGET
- ld1 {v17.2d},[x0] //load Xi
- movi v19.16b,#0xe1
- ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
- shl v19.2d,v19.2d,#57
-#ifndef __AARCH64EB__
- rev64 v17.16b,v17.16b
-#endif
- ext v3.16b,v17.16b,v17.16b,#8
-
- pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
- eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
- pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
- pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
-
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
-
-#ifndef __AARCH64EB__
- rev64 v0.16b,v0.16b
-#endif
- ext v0.16b,v0.16b,v0.16b,#8
- st1 {v0.2d},[x0] //write out Xi
-
- ret
-
-.globl _gcm_ghash_v8
-.private_extern _gcm_ghash_v8
-
-.align 4
-_gcm_ghash_v8:
- AARCH64_VALID_CALL_TARGET
- cmp x3,#64
- b.hs Lgcm_ghash_v8_4x
- ld1 {v0.2d},[x0] //load [rotated] Xi
- //"[rotated]" means that
- //loaded value would have
- //to be rotated in order to
- //make it appear as in
- //algorithm specification
- subs x3,x3,#32 //see if x3 is 32 or larger
- mov x12,#16 //x12 is used as post-
- //increment for input pointer;
- //as loop is modulo-scheduled
- //x12 is zeroed just in time
- //to preclude overstepping
- //inp[len], which means that
- //last block[s] are actually
- //loaded twice, but last
- //copy is not processed
- ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
- movi v19.16b,#0xe1
- ld1 {v22.2d},[x1]
- csel x12,xzr,x12,eq //is it time to zero x12?
- ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
- ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
- shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
-#ifndef __AARCH64EB__
- rev64 v16.16b,v16.16b
- rev64 v0.16b,v0.16b
-#endif
- ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
- b.lo Lodd_tail_v8 //x3 was less than 32
- ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
-#ifndef __AARCH64EB__
- rev64 v17.16b,v17.16b
-#endif
- ext v7.16b,v17.16b,v17.16b,#8
- eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
- pmull v4.1q,v20.1d,v7.1d //H·Ii+1
- eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
- pmull2 v6.1q,v20.2d,v7.2d
- b Loop_mod2x_v8
-
-.align 4
-Loop_mod2x_v8:
- ext v18.16b,v3.16b,v3.16b,#8
- subs x3,x3,#32 //is there more data?
- pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
- csel x12,xzr,x12,lo //is it time to zero x12?
-
- pmull v5.1q,v21.1d,v17.1d
- eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
- pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
- eor v0.16b,v0.16b,v4.16b //accumulate
- pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
- ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
-
- eor v2.16b,v2.16b,v6.16b
- csel x12,xzr,x12,eq //is it time to zero x12?
- eor v1.16b,v1.16b,v5.16b
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
-#ifndef __AARCH64EB__
- rev64 v16.16b,v16.16b
-#endif
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
-
-#ifndef __AARCH64EB__
- rev64 v17.16b,v17.16b
-#endif
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- ext v7.16b,v17.16b,v17.16b,#8
- ext v3.16b,v16.16b,v16.16b,#8
- eor v0.16b,v1.16b,v18.16b
- pmull v4.1q,v20.1d,v7.1d //H·Ii+1
- eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v3.16b,v3.16b,v18.16b
- eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
- eor v3.16b,v3.16b,v0.16b
- pmull2 v6.1q,v20.2d,v7.2d
- b.hs Loop_mod2x_v8 //there was at least 32 more bytes
-
- eor v2.16b,v2.16b,v18.16b
- ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
- adds x3,x3,#32 //re-construct x3
- eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
- b.eq Ldone_v8 //is x3 zero?
-Lodd_tail_v8:
- ext v18.16b,v0.16b,v0.16b,#8
- eor v3.16b,v3.16b,v0.16b //inp^=Xi
- eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
-
- pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
- eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
- pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
- pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- eor v1.16b,v1.16b,v18.16b
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
-
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
-
-Ldone_v8:
-#ifndef __AARCH64EB__
- rev64 v0.16b,v0.16b
-#endif
- ext v0.16b,v0.16b,v0.16b,#8
- st1 {v0.2d},[x0] //write out Xi
-
- ret
-
-
-.align 4
-gcm_ghash_v8_4x:
-Lgcm_ghash_v8_4x:
- ld1 {v0.2d},[x0] //load [rotated] Xi
- ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
- movi v19.16b,#0xe1
- ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
- shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
-
- ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
-#ifndef __AARCH64EB__
- rev64 v0.16b,v0.16b
- rev64 v5.16b,v5.16b
- rev64 v6.16b,v6.16b
- rev64 v7.16b,v7.16b
- rev64 v4.16b,v4.16b
-#endif
- ext v25.16b,v7.16b,v7.16b,#8
- ext v24.16b,v6.16b,v6.16b,#8
- ext v23.16b,v5.16b,v5.16b,#8
-
- pmull v29.1q,v20.1d,v25.1d //H·Ii+3
- eor v7.16b,v7.16b,v25.16b
- pmull2 v31.1q,v20.2d,v25.2d
- pmull v30.1q,v21.1d,v7.1d
-
- pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
- eor v6.16b,v6.16b,v24.16b
- pmull2 v24.1q,v22.2d,v24.2d
- pmull2 v6.1q,v21.2d,v6.2d
-
- eor v29.16b,v29.16b,v16.16b
- eor v31.16b,v31.16b,v24.16b
- eor v30.16b,v30.16b,v6.16b
-
- pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
- eor v5.16b,v5.16b,v23.16b
- pmull2 v23.1q,v26.2d,v23.2d
- pmull v5.1q,v27.1d,v5.1d
-
- eor v29.16b,v29.16b,v7.16b
- eor v31.16b,v31.16b,v23.16b
- eor v30.16b,v30.16b,v5.16b
-
- subs x3,x3,#128
- b.lo Ltail4x
-
- b Loop4x
-
-.align 4
-Loop4x:
- eor v16.16b,v4.16b,v0.16b
- ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
- ext v3.16b,v16.16b,v16.16b,#8
-#ifndef __AARCH64EB__
- rev64 v5.16b,v5.16b
- rev64 v6.16b,v6.16b
- rev64 v7.16b,v7.16b
- rev64 v4.16b,v4.16b
-#endif
-
- pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
- eor v16.16b,v16.16b,v3.16b
- pmull2 v2.1q,v28.2d,v3.2d
- ext v25.16b,v7.16b,v7.16b,#8
- pmull2 v1.1q,v27.2d,v16.2d
-
- eor v0.16b,v0.16b,v29.16b
- eor v2.16b,v2.16b,v31.16b
- ext v24.16b,v6.16b,v6.16b,#8
- eor v1.16b,v1.16b,v30.16b
- ext v23.16b,v5.16b,v5.16b,#8
-
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- pmull v29.1q,v20.1d,v25.1d //H·Ii+3
- eor v7.16b,v7.16b,v25.16b
- eor v1.16b,v1.16b,v17.16b
- pmull2 v31.1q,v20.2d,v25.2d
- eor v1.16b,v1.16b,v18.16b
- pmull v30.1q,v21.1d,v7.1d
-
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
- eor v6.16b,v6.16b,v24.16b
- pmull2 v24.1q,v22.2d,v24.2d
- eor v0.16b,v1.16b,v18.16b
- pmull2 v6.1q,v21.2d,v6.2d
-
- eor v29.16b,v29.16b,v16.16b
- eor v31.16b,v31.16b,v24.16b
- eor v30.16b,v30.16b,v6.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
- eor v5.16b,v5.16b,v23.16b
- eor v18.16b,v18.16b,v2.16b
- pmull2 v23.1q,v26.2d,v23.2d
- pmull v5.1q,v27.1d,v5.1d
-
- eor v0.16b,v0.16b,v18.16b
- eor v29.16b,v29.16b,v7.16b
- eor v31.16b,v31.16b,v23.16b
- ext v0.16b,v0.16b,v0.16b,#8
- eor v30.16b,v30.16b,v5.16b
-
- subs x3,x3,#64
- b.hs Loop4x
-
-Ltail4x:
- eor v16.16b,v4.16b,v0.16b
- ext v3.16b,v16.16b,v16.16b,#8
-
- pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
- eor v16.16b,v16.16b,v3.16b
- pmull2 v2.1q,v28.2d,v3.2d
- pmull2 v1.1q,v27.2d,v16.2d
-
- eor v0.16b,v0.16b,v29.16b
- eor v2.16b,v2.16b,v31.16b
- eor v1.16b,v1.16b,v30.16b
-
- adds x3,x3,#64
- b.eq Ldone4x
-
- cmp x3,#32
- b.lo Lone
- b.eq Ltwo
-Lthree:
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- ld1 {v4.2d,v5.2d,v6.2d},[x2]
- eor v1.16b,v1.16b,v18.16b
-#ifndef __AARCH64EB__
- rev64 v5.16b,v5.16b
- rev64 v6.16b,v6.16b
- rev64 v4.16b,v4.16b
-#endif
-
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- ext v24.16b,v6.16b,v6.16b,#8
- ext v23.16b,v5.16b,v5.16b,#8
- eor v0.16b,v1.16b,v18.16b
-
- pmull v29.1q,v20.1d,v24.1d //H·Ii+2
- eor v6.16b,v6.16b,v24.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- pmull2 v31.1q,v20.2d,v24.2d
- pmull v30.1q,v21.1d,v6.1d
- eor v0.16b,v0.16b,v18.16b
- pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
- eor v5.16b,v5.16b,v23.16b
- ext v0.16b,v0.16b,v0.16b,#8
-
- pmull2 v23.1q,v22.2d,v23.2d
- eor v16.16b,v4.16b,v0.16b
- pmull2 v5.1q,v21.2d,v5.2d
- ext v3.16b,v16.16b,v16.16b,#8
-
- eor v29.16b,v29.16b,v7.16b
- eor v31.16b,v31.16b,v23.16b
- eor v30.16b,v30.16b,v5.16b
-
- pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
- eor v16.16b,v16.16b,v3.16b
- pmull2 v2.1q,v26.2d,v3.2d
- pmull v1.1q,v27.1d,v16.1d
-
- eor v0.16b,v0.16b,v29.16b
- eor v2.16b,v2.16b,v31.16b
- eor v1.16b,v1.16b,v30.16b
- b Ldone4x
-
-.align 4
-Ltwo:
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- ld1 {v4.2d,v5.2d},[x2]
- eor v1.16b,v1.16b,v18.16b
-#ifndef __AARCH64EB__
- rev64 v5.16b,v5.16b
- rev64 v4.16b,v4.16b
-#endif
-
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- ext v23.16b,v5.16b,v5.16b,#8
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
- ext v0.16b,v0.16b,v0.16b,#8
-
- pmull v29.1q,v20.1d,v23.1d //H·Ii+1
- eor v5.16b,v5.16b,v23.16b
-
- eor v16.16b,v4.16b,v0.16b
- ext v3.16b,v16.16b,v16.16b,#8
-
- pmull2 v31.1q,v20.2d,v23.2d
- pmull v30.1q,v21.1d,v5.1d
-
- pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
- eor v16.16b,v16.16b,v3.16b
- pmull2 v2.1q,v22.2d,v3.2d
- pmull2 v1.1q,v21.2d,v16.2d
-
- eor v0.16b,v0.16b,v29.16b
- eor v2.16b,v2.16b,v31.16b
- eor v1.16b,v1.16b,v30.16b
- b Ldone4x
-
-.align 4
-Lone:
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- ld1 {v4.2d},[x2]
- eor v1.16b,v1.16b,v18.16b
-#ifndef __AARCH64EB__
- rev64 v4.16b,v4.16b
-#endif
-
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
- ext v0.16b,v0.16b,v0.16b,#8
-
- eor v16.16b,v4.16b,v0.16b
- ext v3.16b,v16.16b,v16.16b,#8
-
- pmull v0.1q,v20.1d,v3.1d
- eor v16.16b,v16.16b,v3.16b
- pmull2 v2.1q,v20.2d,v3.2d
- pmull v1.1q,v21.1d,v16.1d
-
-Ldone4x:
- ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
- eor v18.16b,v0.16b,v2.16b
- eor v1.16b,v1.16b,v17.16b
- eor v1.16b,v1.16b,v18.16b
-
- pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
- ins v2.d[0],v1.d[1]
- ins v1.d[1],v0.d[0]
- eor v0.16b,v1.16b,v18.16b
-
- ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
- pmull v0.1q,v0.1d,v19.1d
- eor v18.16b,v18.16b,v2.16b
- eor v0.16b,v0.16b,v18.16b
- ext v0.16b,v0.16b,v0.16b,#8
-
-#ifndef __AARCH64EB__
- rev64 v0.16b,v0.16b
-#endif
- st1 {v0.2d},[x0] //write out Xi
-
- ret
-
-.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
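
The 4-block path removed above (gcm_ghash_v8_4x) relies on rewriting four chained GHASH updates as four independent products against precomputed powers of H, which is what lets the pmull work be interleaved. A hedged sketch of that identity at the level of the underlying field arithmetic (the assembly additionally works with the "twisted" representation of H); gf128_mul_fn and u128 are hypothetical helpers, not BoringSSL types.

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;      /* plain limb pair */
    typedef u128 (*gf128_mul_fn)(u128, u128);      /* any GF(2^128) multiply */

    static u128 xor128(u128 a, u128 b) {
      u128 r = {a.hi ^ b.hi, a.lo ^ b.lo};
      return r;
    }

    /* Xi <- ((((Xi^I0)*H ^ I1)*H ^ I2)*H ^ I3)*H, regrouped so the four
     * multiplications are independent, as the assembly does:
     *   Xi <- (Xi^I0)*H^4 ^ I1*H^3 ^ I2*H^2 ^ I3*H                        */
    static u128 ghash4(gf128_mul_fn mul, u128 Xi, const u128 I[4],
                       u128 H, u128 H2, u128 H3, u128 H4) {
      u128 acc = mul(xor128(Xi, I[0]), H4);
      acc = xor128(acc, mul(I[1], H3));
      acc = xor128(acc, mul(I[2], H2));
      acc = xor128(acc, mul(I[3], H));
      return acc;
    }
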
diff --git a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S b/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S
deleted file mode 100644
index c8469e6..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S
+++ /dev/null
@@ -1,1726 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include "openssl/arm_arch.h"
-
-.section __TEXT,__const
-.align 5
-Lpoly:
-.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
-LRR: // 2^512 mod P precomputed for NIST P256 polynomial
-.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
-Lone_mont:
-.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
-Lone:
-.quad 1,0,0,0
-Lord:
-.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
-LordK:
-.quad 0xccd1c8aaee00bc4f
-.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.text
-
-// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
-// const BN_ULONG x2[4]);
-.globl _ecp_nistz256_mul_mont
-.private_extern _ecp_nistz256_mul_mont
-
-.align 4
-_ecp_nistz256_mul_mont:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-32]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
-
- ldr x3,[x2] // bp[0]
- ldp x4,x5,[x1]
- ldp x6,x7,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- bl __ecp_nistz256_mul_mont
-
- ldp x19,x20,[sp,#16]
- ldp x29,x30,[sp],#32
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
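
As the comment on LRR above notes, that constant is 2^512 mod p, i.e. R^2 for R = 2^256, so multiplying by it through ecp_nistz256_mul_mont (which computes a*b*R^-1 mod p) converts a value into the Montgomery domain. A small illustrative wrapper follows, using the prototype given above; kRR is copied from the LRR constant, least-significant limb first, and to_montgomery is a hypothetical helper name, not an existing API.

    #include <stdint.h>

    typedef uint64_t BN_ULONG;

    /* Prototype as in the comment above; the body is the assembly being
     * removed here (on this target). */
    void ecp_nistz256_mul_mont(BN_ULONG res[4], const BN_ULONG a[4],
                               const BN_ULONG b[4]);

    /* RR = 2^512 mod p (the LRR constant), limbs least-significant first. */
    static const BN_ULONG kRR[4] = {0x0000000000000003, 0xfffffffbffffffff,
                                    0xfffffffffffffffe, 0x00000004fffffffd};

    /* a -> a*R mod p, since mul_mont(a, RR) = a*R^2*R^-1 = a*R (mod p). */
    static void to_montgomery(BN_ULONG out[4], const BN_ULONG in[4]) {
      ecp_nistz256_mul_mont(out, in, kRR);
    }
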
-// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl _ecp_nistz256_sqr_mont
-.private_extern _ecp_nistz256_sqr_mont
-
-.align 4
-_ecp_nistz256_sqr_mont:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-32]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
-
- ldp x4,x5,[x1]
- ldp x6,x7,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- bl __ecp_nistz256_sqr_mont
-
- ldp x19,x20,[sp,#16]
- ldp x29,x30,[sp],#32
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl _ecp_nistz256_div_by_2
-.private_extern _ecp_nistz256_div_by_2
-
-.align 4
-_ecp_nistz256_div_by_2:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ldp x14,x15,[x1]
- ldp x16,x17,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- bl __ecp_nistz256_div_by_2
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl _ecp_nistz256_mul_by_2
-.private_extern _ecp_nistz256_mul_by_2
-
-.align 4
-_ecp_nistz256_mul_by_2:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ldp x14,x15,[x1]
- ldp x16,x17,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
- mov x8,x14
- mov x9,x15
- mov x10,x16
- mov x11,x17
-
- bl __ecp_nistz256_add_to // ret = a+a // 2*a
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl _ecp_nistz256_mul_by_3
-.private_extern _ecp_nistz256_mul_by_3
-
-.align 4
-_ecp_nistz256_mul_by_3:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ldp x14,x15,[x1]
- ldp x16,x17,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
- mov x8,x14
- mov x9,x15
- mov x10,x16
- mov x11,x17
- mov x4,x14
- mov x5,x15
- mov x6,x16
- mov x7,x17
-
- bl __ecp_nistz256_add_to // ret = a+a // 2*a
-
- mov x8,x4
- mov x9,x5
- mov x10,x6
- mov x11,x7
-
- bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
-// const BN_ULONG x2[4]);
-.globl _ecp_nistz256_sub
-.private_extern _ecp_nistz256_sub
-
-.align 4
-_ecp_nistz256_sub:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ldp x14,x15,[x1]
- ldp x16,x17,[x1,#16]
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- bl __ecp_nistz256_sub_from
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl _ecp_nistz256_neg
-.private_extern _ecp_nistz256_neg
-
-.align 4
-_ecp_nistz256_neg:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- mov x2,x1
- mov x14,xzr // a = 0
- mov x15,xzr
- mov x16,xzr
- mov x17,xzr
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- bl __ecp_nistz256_sub_from
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
-// to x4-x7 and b[0] - to x3
-
-.align 4
-__ecp_nistz256_mul_mont:
- mul x14,x4,x3 // a[0]*b[0]
- umulh x8,x4,x3
-
- mul x15,x5,x3 // a[1]*b[0]
- umulh x9,x5,x3
-
- mul x16,x6,x3 // a[2]*b[0]
- umulh x10,x6,x3
-
- mul x17,x7,x3 // a[3]*b[0]
- umulh x11,x7,x3
- ldr x3,[x2,#8] // b[1]
-
- adds x15,x15,x8 // accumulate high parts of multiplication
- lsl x8,x14,#32
- adcs x16,x16,x9
- lsr x9,x14,#32
- adcs x17,x17,x10
- adc x19,xzr,x11
- mov x20,xzr
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- mul x8,x4,x3 // lo(a[0]*b[i])
- adcs x15,x16,x9
- mul x9,x5,x3 // lo(a[1]*b[i])
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- mul x10,x6,x3 // lo(a[2]*b[i])
- adcs x17,x19,x11
- mul x11,x7,x3 // lo(a[3]*b[i])
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts of multiplication
- umulh x8,x4,x3 // hi(a[0]*b[i])
- adcs x15,x15,x9
- umulh x9,x5,x3 // hi(a[1]*b[i])
- adcs x16,x16,x10
- umulh x10,x6,x3 // hi(a[2]*b[i])
- adcs x17,x17,x11
- umulh x11,x7,x3 // hi(a[3]*b[i])
- adc x19,x19,xzr
- ldr x3,[x2,#8*(1+1)] // b[1+1]
- adds x15,x15,x8 // accumulate high parts of multiplication
- lsl x8,x14,#32
- adcs x16,x16,x9
- lsr x9,x14,#32
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- mul x8,x4,x3 // lo(a[0]*b[i])
- adcs x15,x16,x9
- mul x9,x5,x3 // lo(a[1]*b[i])
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- mul x10,x6,x3 // lo(a[2]*b[i])
- adcs x17,x19,x11
- mul x11,x7,x3 // lo(a[3]*b[i])
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts of multiplication
- umulh x8,x4,x3 // hi(a[0]*b[i])
- adcs x15,x15,x9
- umulh x9,x5,x3 // hi(a[1]*b[i])
- adcs x16,x16,x10
- umulh x10,x6,x3 // hi(a[2]*b[i])
- adcs x17,x17,x11
- umulh x11,x7,x3 // hi(a[3]*b[i])
- adc x19,x19,xzr
- ldr x3,[x2,#8*(2+1)] // b[2+1]
- adds x15,x15,x8 // accumulate high parts of multiplication
- lsl x8,x14,#32
- adcs x16,x16,x9
- lsr x9,x14,#32
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- mul x8,x4,x3 // lo(a[0]*b[i])
- adcs x15,x16,x9
- mul x9,x5,x3 // lo(a[1]*b[i])
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- mul x10,x6,x3 // lo(a[2]*b[i])
- adcs x17,x19,x11
- mul x11,x7,x3 // lo(a[3]*b[i])
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts of multiplication
- umulh x8,x4,x3 // hi(a[0]*b[i])
- adcs x15,x15,x9
- umulh x9,x5,x3 // hi(a[1]*b[i])
- adcs x16,x16,x10
- umulh x10,x6,x3 // hi(a[2]*b[i])
- adcs x17,x17,x11
- umulh x11,x7,x3 // hi(a[3]*b[i])
- adc x19,x19,xzr
- adds x15,x15,x8 // accumulate high parts of multiplication
- lsl x8,x14,#32
- adcs x16,x16,x9
- lsr x9,x14,#32
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- // last reduction
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- adcs x15,x16,x9
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- adcs x17,x19,x11
- adc x19,x20,xzr
-
- adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
- sbcs x9,x15,x12
- sbcs x10,x16,xzr
- sbcs x11,x17,x13
- sbcs xzr,x19,xzr // did it borrow?
-
- csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
- csel x15,x15,x9,lo
- csel x16,x16,x10,lo
- stp x14,x15,[x0]
- csel x17,x17,x11,lo
- stp x16,x17,[x0,#16]
-
- ret
-
-
-// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
-// to x4-x7
-
-.align 4
-__ecp_nistz256_sqr_mont:
- // | | | | | |a1*a0| |
- // | | | | |a2*a0| | |
- // | |a3*a2|a3*a0| | | |
- // | | | |a2*a1| | | |
- // | | |a3*a1| | | | |
- // *| | | | | | | | 2|
- // +|a3*a3|a2*a2|a1*a1|a0*a0|
- // |--+--+--+--+--+--+--+--|
- // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit limb x of the result
- //
- // "can't overflow" below marks carries into the high part of a
- // multiplication result, which can't overflow, because it
- // can never be all ones.
-
- mul x15,x5,x4 // a[1]*a[0]
- umulh x9,x5,x4
- mul x16,x6,x4 // a[2]*a[0]
- umulh x10,x6,x4
- mul x17,x7,x4 // a[3]*a[0]
- umulh x19,x7,x4
-
- adds x16,x16,x9 // accumulate high parts of multiplication
- mul x8,x6,x5 // a[2]*a[1]
- umulh x9,x6,x5
- adcs x17,x17,x10
- mul x10,x7,x5 // a[3]*a[1]
- umulh x11,x7,x5
- adc x19,x19,xzr // can't overflow
-
- mul x20,x7,x6 // a[3]*a[2]
- umulh x1,x7,x6
-
- adds x9,x9,x10 // accumulate high parts of multiplication
- mul x14,x4,x4 // a[0]*a[0]
- adc x10,x11,xzr // can't overflow
-
- adds x17,x17,x8 // accumulate low parts of multiplication
- umulh x4,x4,x4
- adcs x19,x19,x9
- mul x9,x5,x5 // a[1]*a[1]
- adcs x20,x20,x10
- umulh x5,x5,x5
- adc x1,x1,xzr // can't overflow
-
- adds x15,x15,x15 // acc[1-6]*=2
- mul x10,x6,x6 // a[2]*a[2]
- adcs x16,x16,x16
- umulh x6,x6,x6
- adcs x17,x17,x17
- mul x11,x7,x7 // a[3]*a[3]
- adcs x19,x19,x19
- umulh x7,x7,x7
- adcs x20,x20,x20
- adcs x1,x1,x1
- adc x2,xzr,xzr
-
- adds x15,x15,x4 // +a[i]*a[i]
- adcs x16,x16,x9
- adcs x17,x17,x5
- adcs x19,x19,x10
- adcs x20,x20,x6
- lsl x8,x14,#32
- adcs x1,x1,x11
- lsr x9,x14,#32
- adc x2,x2,x7
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- adcs x15,x16,x9
- lsl x8,x14,#32
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- lsr x9,x14,#32
- adc x17,x11,xzr // can't overflow
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- adcs x15,x16,x9
- lsl x8,x14,#32
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- lsr x9,x14,#32
- adc x17,x11,xzr // can't overflow
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- adcs x15,x16,x9
- lsl x8,x14,#32
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- lsr x9,x14,#32
- adc x17,x11,xzr // can't overflow
- subs x10,x14,x8 // "*0xffff0001"
- sbc x11,x14,x9
- adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
- adcs x15,x16,x9
- adcs x16,x17,x10 // +=acc[0]*0xffff0001
- adc x17,x11,xzr // can't overflow
-
- adds x14,x14,x19 // accumulate upper half
- adcs x15,x15,x20
- adcs x16,x16,x1
- adcs x17,x17,x2
- adc x19,xzr,xzr
-
- adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
- sbcs x9,x15,x12
- sbcs x10,x16,xzr
- sbcs x11,x17,x13
- sbcs xzr,x19,xzr // did it borrow?
-
- csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
- csel x15,x15,x9,lo
- csel x16,x16,x10,lo
- stp x14,x15,[x0]
- csel x17,x17,x11,lo
- stp x16,x17,[x0,#16]
-
- ret
-
-
-// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
-// x4-x7 and x8-x11. This is done because it's used in multiple
-// contexts, e.g. in multiplication by 2 and 3...
-
-.align 4
-__ecp_nistz256_add_to:
- adds x14,x14,x8 // ret = a+b
- adcs x15,x15,x9
- adcs x16,x16,x10
- adcs x17,x17,x11
- adc x1,xzr,xzr // zap x1
-
- adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
- sbcs x9,x15,x12
- sbcs x10,x16,xzr
- sbcs x11,x17,x13
- sbcs xzr,x1,xzr // did subtraction borrow?
-
- csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
- csel x15,x15,x9,lo
- csel x16,x16,x10,lo
- stp x14,x15,[x0]
- csel x17,x17,x11,lo
- stp x16,x17,[x0,#16]
-
- ret
-
-
-
-.align 4
-__ecp_nistz256_sub_from:
- ldp x8,x9,[x2]
- ldp x10,x11,[x2,#16]
- subs x14,x14,x8 // ret = a-b
- sbcs x15,x15,x9
- sbcs x16,x16,x10
- sbcs x17,x17,x11
- sbc x1,xzr,xzr // zap x1
-
- subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
- adcs x9,x15,x12
- adcs x10,x16,xzr
- adc x11,x17,x13
- cmp x1,xzr // did subtraction borrow?
-
- csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
- csel x15,x15,x9,eq
- csel x16,x16,x10,eq
- stp x14,x15,[x0]
- csel x17,x17,x11,eq
- stp x16,x17,[x0,#16]
-
- ret
-
-
-
-.align 4
-__ecp_nistz256_sub_morf:
- ldp x8,x9,[x2]
- ldp x10,x11,[x2,#16]
- subs x14,x8,x14 // ret = b-a
- sbcs x15,x9,x15
- sbcs x16,x10,x16
- sbcs x17,x11,x17
- sbc x1,xzr,xzr // zap x1
-
- subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
- adcs x9,x15,x12
- adcs x10,x16,xzr
- adc x11,x17,x13
- cmp x1,xzr // did subtraction borrow?
-
- csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
- csel x15,x15,x9,eq
- csel x16,x16,x10,eq
- stp x14,x15,[x0]
- csel x17,x17,x11,eq
- stp x16,x17,[x0,#16]
-
- ret
-
-
-
-.align 4
-__ecp_nistz256_div_by_2:
- subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
- adcs x9,x15,x12
- adcs x10,x16,xzr
- adcs x11,x17,x13
- adc x1,xzr,xzr // zap x1
- tst x14,#1 // is a even?
-
- csel x14,x14,x8,eq // ret = even ? a : a+modulus
- csel x15,x15,x9,eq
- csel x16,x16,x10,eq
- csel x17,x17,x11,eq
- csel x1,xzr,x1,eq
-
- lsr x14,x14,#1 // ret >>= 1
- orr x14,x14,x15,lsl#63
- lsr x15,x15,#1
- orr x15,x15,x16,lsl#63
- lsr x16,x16,#1
- orr x16,x16,x17,lsl#63
- lsr x17,x17,#1
- stp x14,x15,[x0]
- orr x17,x17,x1,lsl#63
- stp x16,x17,[x0,#16]
-
- ret
-
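
The __ecp_nistz256_div_by_2 helper above computes a halving mod p: if the value is odd it first adds the (odd) modulus so the sum is even, then shifts right by one, pulling the carry bit back into the top limb. A portable sketch of that idea (illustrative only; it branches where the assembly uses csel to stay constant-time):

    #include <stdint.h>

    /* r = a/2 mod p for 4-limb little-endian values, p odd (e.g. Lpoly above). */
    static void mod_div_by_2(uint64_t r[4], const uint64_t a[4],
                             const uint64_t p[4]) {
      uint64_t t[4], carry = 0;
      for (int i = 0; i < 4; i++) {  /* t = a + p, keeping the carry out */
        uint64_t s = a[i] + carry;
        carry = s < a[i];
        t[i] = s + p[i];
        carry += t[i] < s;
      }
      int odd = a[0] & 1;
      const uint64_t *src = odd ? t : a;   /* a+p is even when a is odd */
      uint64_t top_bit = odd ? carry : 0;  /* bit 256 shifted back in   */
      for (int i = 0; i < 3; i++) {
        r[i] = (src[i] >> 1) | (src[i + 1] << 63);
      }
      r[3] = (src[3] >> 1) | (top_bit << 63);
    }
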
-.globl _ecp_nistz256_point_double
-.private_extern _ecp_nistz256_point_double
-
-.align 5
-_ecp_nistz256_point_double:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- sub sp,sp,#32*4
-
-Ldouble_shortcut:
- ldp x14,x15,[x1,#32]
- mov x21,x0
- ldp x16,x17,[x1,#48]
- mov x22,x1
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- mov x8,x14
- ldr x13,[x13,#24]
- mov x9,x15
- ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
- mov x10,x16
- mov x11,x17
- ldp x6,x7,[x22,#64+16]
- add x0,sp,#0
- bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);
-
- add x0,sp,#64
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
-
- ldp x8,x9,[x22]
- ldp x10,x11,[x22,#16]
- mov x4,x14 // put Zsqr aside for p256_sub
- mov x5,x15
- mov x6,x16
- mov x7,x17
- add x0,sp,#32
- bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);
-
- add x2,x22,#0
- mov x14,x4 // restore Zsqr
- mov x15,x5
- ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
- mov x16,x6
- mov x17,x7
- ldp x6,x7,[sp,#0+16]
- add x0,sp,#64
- bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
-
- add x0,sp,#0
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
-
- ldr x3,[x22,#32]
- ldp x4,x5,[x22,#64]
- ldp x6,x7,[x22,#64+16]
- add x2,x22,#32
- add x0,sp,#96
- bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
-
- mov x8,x14
- mov x9,x15
- ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
- mov x10,x16
- mov x11,x17
- ldp x6,x7,[sp,#0+16]
- add x0,x21,#64
- bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);
-
- add x0,sp,#96
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
-
- ldr x3,[sp,#64] // forward load for p256_mul_mont
- ldp x4,x5,[sp,#32]
- ldp x6,x7,[sp,#32+16]
- add x0,x21,#32
- bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
-
- add x2,sp,#64
- add x0,sp,#32
- bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
-
- mov x8,x14 // duplicate M
- mov x9,x15
- mov x10,x16
- mov x11,x17
- mov x4,x14 // put M aside
- mov x5,x15
- mov x6,x16
- mov x7,x17
- add x0,sp,#32
- bl __ecp_nistz256_add_to
- mov x8,x4 // restore M
- mov x9,x5
- ldr x3,[x22] // forward load for p256_mul_mont
- mov x10,x6
- ldp x4,x5,[sp,#0]
- mov x11,x7
- ldp x6,x7,[sp,#0+16]
- bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);
-
- add x2,x22,#0
- add x0,sp,#0
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
-
- mov x8,x14
- mov x9,x15
- ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
- mov x10,x16
- mov x11,x17
- ldp x6,x7,[sp,#32+16]
- add x0,sp,#96
- bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);
-
- add x0,x21,#0
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
-
- add x2,sp,#96
- bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
-
- add x2,sp,#0
- add x0,sp,#0
- bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
-
- ldr x3,[sp,#32]
- mov x4,x14 // copy S
- mov x5,x15
- mov x6,x16
- mov x7,x17
- add x2,sp,#32
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
-
- add x2,x21,#32
- add x0,x21,#32
- bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
-
- add sp,x29,#0 // destroy frame
- ldp x19,x20,[x29,#16]
- ldp x21,x22,[x29,#32]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _ecp_nistz256_point_add
-.private_extern _ecp_nistz256_point_add
-
-.align 5
-_ecp_nistz256_point_add:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#32*12
-
- ldp x4,x5,[x2,#64] // in2_z
- ldp x6,x7,[x2,#64+16]
- mov x21,x0
- mov x22,x1
- mov x23,x2
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
- orr x8,x4,x5
- orr x10,x6,x7
- orr x25,x8,x10
- cmp x25,#0
- csetm x25,ne // ~in2infty
- add x0,sp,#192
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
-
- ldp x4,x5,[x22,#64] // in1_z
- ldp x6,x7,[x22,#64+16]
- orr x8,x4,x5
- orr x10,x6,x7
- orr x24,x8,x10
- cmp x24,#0
- csetm x24,ne // ~in1infty
- add x0,sp,#128
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
-
- ldr x3,[x23,#64]
- ldp x4,x5,[sp,#192]
- ldp x6,x7,[sp,#192+16]
- add x2,x23,#64
- add x0,sp,#320
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
-
- ldr x3,[x22,#64]
- ldp x4,x5,[sp,#128]
- ldp x6,x7,[sp,#128+16]
- add x2,x22,#64
- add x0,sp,#352
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
-
- ldr x3,[x22,#32]
- ldp x4,x5,[sp,#320]
- ldp x6,x7,[sp,#320+16]
- add x2,x22,#32
- add x0,sp,#320
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
-
- ldr x3,[x23,#32]
- ldp x4,x5,[sp,#352]
- ldp x6,x7,[sp,#352+16]
- add x2,x23,#32
- add x0,sp,#352
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
-
- add x2,sp,#320
- ldr x3,[sp,#192] // forward load for p256_mul_mont
- ldp x4,x5,[x22]
- ldp x6,x7,[x22,#16]
- add x0,sp,#160
- bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
-
- orr x14,x14,x15 // see if result is zero
- orr x16,x16,x17
- orr x26,x14,x16 // ~is_equal(S1,S2)
-
- add x2,sp,#192
- add x0,sp,#256
- bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
-
- ldr x3,[sp,#128]
- ldp x4,x5,[x23]
- ldp x6,x7,[x23,#16]
- add x2,sp,#128
- add x0,sp,#288
- bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
-
- add x2,sp,#256
- ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
- ldp x6,x7,[sp,#160+16]
- add x0,sp,#96
- bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
-
- orr x14,x14,x15 // see if result is zero
- orr x16,x16,x17
- orr x14,x14,x16 // ~is_equal(U1,U2)
-
- mvn x27,x24 // -1/0 -> 0/-1
- mvn x28,x25 // -1/0 -> 0/-1
- orr x14,x14,x27
- orr x14,x14,x28
- orr x14,x14,x26
- cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
-
-Ladd_double:
- mov x1,x22
- mov x0,x21
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
- b Ldouble_shortcut
-
-.align 4
-Ladd_proceed:
- add x0,sp,#192
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
-
- ldr x3,[x22,#64]
- ldp x4,x5,[sp,#96]
- ldp x6,x7,[sp,#96+16]
- add x2,x22,#64
- add x0,sp,#64
- bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
-
- ldp x4,x5,[sp,#96]
- ldp x6,x7,[sp,#96+16]
- add x0,sp,#128
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
-
- ldr x3,[x23,#64]
- ldp x4,x5,[sp,#64]
- ldp x6,x7,[sp,#64+16]
- add x2,x23,#64
- add x0,sp,#64
- bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
-
- ldr x3,[sp,#96]
- ldp x4,x5,[sp,#128]
- ldp x6,x7,[sp,#128+16]
- add x2,sp,#96
- add x0,sp,#224
- bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
-
- ldr x3,[sp,#128]
- ldp x4,x5,[sp,#256]
- ldp x6,x7,[sp,#256+16]
- add x2,sp,#128
- add x0,sp,#288
- bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
-
- mov x8,x14
- mov x9,x15
- mov x10,x16
- mov x11,x17
- add x0,sp,#128
- bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
-
- add x2,sp,#192
- add x0,sp,#0
- bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
-
- add x2,sp,#224
- bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
-
- add x2,sp,#288
- ldr x3,[sp,#224] // forward load for p256_mul_mont
- ldp x4,x5,[sp,#320]
- ldp x6,x7,[sp,#320+16]
- add x0,sp,#32
- bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
-
- add x2,sp,#224
- add x0,sp,#352
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
-
- ldr x3,[sp,#160]
- ldp x4,x5,[sp,#32]
- ldp x6,x7,[sp,#32+16]
- add x2,sp,#160
- add x0,sp,#32
- bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
-
- add x2,sp,#352
- bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
-
- ldp x4,x5,[sp,#0] // res
- ldp x6,x7,[sp,#0+16]
- ldp x8,x9,[x23] // in2
- ldp x10,x11,[x23,#16]
- ldp x14,x15,[x22,#0] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#0+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- ldp x4,x5,[sp,#0+0+32] // res
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- ldp x6,x7,[sp,#0+0+48]
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- ldp x8,x9,[x23,#0+32] // in2
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- ldp x10,x11,[x23,#0+48]
- stp x14,x15,[x21,#0]
- stp x16,x17,[x21,#0+16]
- ldp x14,x15,[x22,#32] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#32+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- ldp x4,x5,[sp,#0+32+32] // res
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- ldp x6,x7,[sp,#0+32+48]
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- ldp x8,x9,[x23,#32+32] // in2
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- ldp x10,x11,[x23,#32+48]
- stp x14,x15,[x21,#32]
- stp x16,x17,[x21,#32+16]
- ldp x14,x15,[x22,#64] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#64+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- stp x14,x15,[x21,#64]
- stp x16,x17,[x21,#64+16]
-
-Ladd_done:
- add sp,x29,#0 // destroy frame
- ldp x19,x20,[x29,#16]
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _ecp_nistz256_point_add_affine
-.private_extern _ecp_nistz256_point_add_affine
-
-.align 5
-_ecp_nistz256_point_add_affine:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- sub sp,sp,#32*10
-
- mov x21,x0
- mov x22,x1
- mov x23,x2
- adrp x13,Lpoly@PAGE
- add x13,x13,Lpoly@PAGEOFF
- ldr x12,[x13,#8]
- ldr x13,[x13,#24]
-
- ldp x4,x5,[x1,#64] // in1_z
- ldp x6,x7,[x1,#64+16]
- orr x8,x4,x5
- orr x10,x6,x7
- orr x24,x8,x10
- cmp x24,#0
- csetm x24,ne // ~in1infty
-
- ldp x14,x15,[x2] // in2_x
- ldp x16,x17,[x2,#16]
- ldp x8,x9,[x2,#32] // in2_y
- ldp x10,x11,[x2,#48]
- orr x14,x14,x15
- orr x16,x16,x17
- orr x8,x8,x9
- orr x10,x10,x11
- orr x14,x14,x16
- orr x8,x8,x10
- orr x25,x14,x8
- cmp x25,#0
- csetm x25,ne // ~in2infty
-
- add x0,sp,#128
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
-
- mov x4,x14
- mov x5,x15
- mov x6,x16
- mov x7,x17
- ldr x3,[x23]
- add x2,x23,#0
- add x0,sp,#96
- bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
-
- add x2,x22,#0
- ldr x3,[x22,#64] // forward load for p256_mul_mont
- ldp x4,x5,[sp,#128]
- ldp x6,x7,[sp,#128+16]
- add x0,sp,#160
- bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
-
- add x2,x22,#64
- add x0,sp,#128
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
-
- ldr x3,[x22,#64]
- ldp x4,x5,[sp,#160]
- ldp x6,x7,[sp,#160+16]
- add x2,x22,#64
- add x0,sp,#64
- bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
-
- ldr x3,[x23,#32]
- ldp x4,x5,[sp,#128]
- ldp x6,x7,[sp,#128+16]
- add x2,x23,#32
- add x0,sp,#128
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
-
- add x2,x22,#32
- ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
- ldp x6,x7,[sp,#160+16]
- add x0,sp,#192
- bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
-
- add x0,sp,#224
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
-
- ldp x4,x5,[sp,#192]
- ldp x6,x7,[sp,#192+16]
- add x0,sp,#288
- bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
-
- ldr x3,[sp,#160]
- ldp x4,x5,[sp,#224]
- ldp x6,x7,[sp,#224+16]
- add x2,sp,#160
- add x0,sp,#256
- bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
-
- ldr x3,[x22]
- ldp x4,x5,[sp,#224]
- ldp x6,x7,[sp,#224+16]
- add x2,x22,#0
- add x0,sp,#96
- bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
-
- mov x8,x14
- mov x9,x15
- mov x10,x16
- mov x11,x17
- add x0,sp,#224
- bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
-
- add x2,sp,#288
- add x0,sp,#0
- bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
-
- add x2,sp,#256
- bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
-
- add x2,sp,#96
- ldr x3,[x22,#32] // forward load for p256_mul_mont
- ldp x4,x5,[sp,#256]
- ldp x6,x7,[sp,#256+16]
- add x0,sp,#32
- bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
-
- add x2,x22,#32
- add x0,sp,#128
- bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
-
- ldr x3,[sp,#192]
- ldp x4,x5,[sp,#32]
- ldp x6,x7,[sp,#32+16]
- add x2,sp,#192
- add x0,sp,#32
- bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
-
- add x2,sp,#128
- bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
-
- ldp x4,x5,[sp,#0] // res
- ldp x6,x7,[sp,#0+16]
- ldp x8,x9,[x23] // in2
- ldp x10,x11,[x23,#16]
- ldp x14,x15,[x22,#0] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#0+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- ldp x4,x5,[sp,#0+0+32] // res
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- ldp x6,x7,[sp,#0+0+48]
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- ldp x8,x9,[x23,#0+32] // in2
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- ldp x10,x11,[x23,#0+48]
- stp x14,x15,[x21,#0]
- stp x16,x17,[x21,#0+16]
- adrp x23,Lone_mont@PAGE-64
- add x23,x23,Lone_mont@PAGEOFF-64
- ldp x14,x15,[x22,#32] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#32+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- ldp x4,x5,[sp,#0+32+32] // res
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- ldp x6,x7,[sp,#0+32+48]
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- ldp x8,x9,[x23,#32+32] // in2
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- ldp x10,x11,[x23,#32+48]
- stp x14,x15,[x21,#32]
- stp x16,x17,[x21,#32+16]
- ldp x14,x15,[x22,#64] // in1
- cmp x24,#0 // ~, remember?
- ldp x16,x17,[x22,#64+16]
- csel x8,x4,x8,ne
- csel x9,x5,x9,ne
- csel x10,x6,x10,ne
- csel x11,x7,x11,ne
- cmp x25,#0 // ~, remember?
- csel x14,x8,x14,ne
- csel x15,x9,x15,ne
- csel x16,x10,x16,ne
- csel x17,x11,x17,ne
- stp x14,x15,[x21,#64]
- stp x16,x17,[x21,#64+16]
-
- add sp,x29,#0 // destroy frame
- ldp x19,x20,[x29,#16]
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x29,x30,[sp],#80
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
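Read top to bottom, the p256_* comments in _ecp_nistz256_point_add_affine above trace a mixed Jacobian-plus-affine point addition. A compact C-style restatement of that sequence follows; fe, mul_mont, sqr_mont, sub_mod and mul_by_2 are hypothetical stand-ins for 4-word field-element helpers (declarations only, not the symbols BoringSSL exports), so this is a sketch of the formula order, not the shipped implementation.

#include <stdint.h>

typedef uint64_t fe[4];                 /* one 256-bit field element */
/* Hypothetical helpers, named after the comments above. */
void mul_mont(fe r, const fe a, const fe b);
void sqr_mont(fe r, const fe a);
void sub_mod(fe r, const fe a, const fe b);
void mul_by_2(fe r, const fe a);

/* Mixed Jacobian + affine addition, following the comment trail above. */
static void point_add_affine_sketch(fe res_x, fe res_y, fe res_z,
                                    const fe in1_x, const fe in1_y,
                                    const fe in1_z, const fe in2_x,
                                    const fe in2_y) {
  fe Z1sqr, U2, H, S2, R, Hsqr, Rsqr, Hcub;
  sqr_mont(Z1sqr, in1_z);          /* Z1sqr = Z1^2            */
  mul_mont(U2, Z1sqr, in2_x);      /* U2    = X2*Z1^2         */
  sub_mod(H, U2, in1_x);           /* H     = U2 - X1         */
  mul_mont(S2, Z1sqr, in1_z);      /* S2    = Z1^3            */
  mul_mont(res_z, H, in1_z);       /* Z3    = H*Z1            */
  mul_mont(S2, S2, in2_y);         /* S2    = Y2*Z1^3         */
  sub_mod(R, S2, in1_y);           /* R     = S2 - Y1         */
  sqr_mont(Hsqr, H);               /* Hsqr  = H^2             */
  sqr_mont(Rsqr, R);               /* Rsqr  = R^2             */
  mul_mont(Hcub, Hsqr, H);         /* Hcub  = H^3             */
  mul_mont(U2, in1_x, Hsqr);       /* U2    = X1*H^2          */
  mul_by_2(Hsqr, U2);              /* Hsqr  = 2*U2            */
  sub_mod(res_x, Rsqr, Hsqr);      /* X3    = R^2 - 2*U2      */
  sub_mod(res_x, res_x, Hcub);     /* X3   -= H^3             */
  sub_mod(res_y, U2, res_x);       /* Y3    = U2 - X3         */
  mul_mont(S2, in1_y, Hcub);       /* S2    = Y1*H^3          */
  mul_mont(res_y, res_y, R);       /* Y3   *= R               */
  sub_mod(res_y, res_y, S2);       /* Y3   -= S2              */
}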
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
-// uint64_t b[4]);
-.globl _ecp_nistz256_ord_mul_mont
-.private_extern _ecp_nistz256_ord_mul_mont
-
-.align 4
-_ecp_nistz256_ord_mul_mont:
- AARCH64_VALID_CALL_TARGET
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
-
- adrp x23,Lord@PAGE
- add x23,x23,Lord@PAGEOFF
- ldr x3,[x2] // bp[0]
- ldp x4,x5,[x1]
- ldp x6,x7,[x1,#16]
-
- ldp x12,x13,[x23,#0]
- ldp x21,x22,[x23,#16]
- ldr x23,[x23,#32]
-
- mul x14,x4,x3 // a[0]*b[0]
- umulh x8,x4,x3
-
- mul x15,x5,x3 // a[1]*b[0]
- umulh x9,x5,x3
-
- mul x16,x6,x3 // a[2]*b[0]
- umulh x10,x6,x3
-
- mul x17,x7,x3 // a[3]*b[0]
- umulh x19,x7,x3
-
- mul x24,x14,x23
-
- adds x15,x15,x8 // accumulate high parts of multiplication
- adcs x16,x16,x9
- adcs x17,x17,x10
- adc x19,x19,xzr
- mov x20,xzr
- ldr x3,[x2,#8*1] // b[i]
-
- lsl x8,x24,#32
- subs x16,x16,x24
- lsr x9,x24,#32
- sbcs x17,x17,x8
- sbcs x19,x19,x9
- sbc x20,x20,xzr
-
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- mul x8,x4,x3
- adc x11,x11,xzr
- mul x9,x5,x3
-
- adds x14,x15,x10
- mul x10,x6,x3
- adcs x15,x16,x11
- mul x11,x7,x3
- adcs x16,x17,x24
- adcs x17,x19,x24
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts
- umulh x8,x4,x3
- adcs x15,x15,x9
- umulh x9,x5,x3
- adcs x16,x16,x10
- umulh x10,x6,x3
- adcs x17,x17,x11
- umulh x11,x7,x3
- adc x19,x19,xzr
- mul x24,x14,x23
- adds x15,x15,x8 // accumulate high parts
- adcs x16,x16,x9
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- ldr x3,[x2,#8*2] // b[i]
-
- lsl x8,x24,#32
- subs x16,x16,x24
- lsr x9,x24,#32
- sbcs x17,x17,x8
- sbcs x19,x19,x9
- sbc x20,x20,xzr
-
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- mul x8,x4,x3
- adc x11,x11,xzr
- mul x9,x5,x3
-
- adds x14,x15,x10
- mul x10,x6,x3
- adcs x15,x16,x11
- mul x11,x7,x3
- adcs x16,x17,x24
- adcs x17,x19,x24
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts
- umulh x8,x4,x3
- adcs x15,x15,x9
- umulh x9,x5,x3
- adcs x16,x16,x10
- umulh x10,x6,x3
- adcs x17,x17,x11
- umulh x11,x7,x3
- adc x19,x19,xzr
- mul x24,x14,x23
- adds x15,x15,x8 // accumulate high parts
- adcs x16,x16,x9
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- ldr x3,[x2,#8*3] // b[i]
-
- lsl x8,x24,#32
- subs x16,x16,x24
- lsr x9,x24,#32
- sbcs x17,x17,x8
- sbcs x19,x19,x9
- sbc x20,x20,xzr
-
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- mul x8,x4,x3
- adc x11,x11,xzr
- mul x9,x5,x3
-
- adds x14,x15,x10
- mul x10,x6,x3
- adcs x15,x16,x11
- mul x11,x7,x3
- adcs x16,x17,x24
- adcs x17,x19,x24
- adc x19,x20,xzr
-
- adds x14,x14,x8 // accumulate low parts
- umulh x8,x4,x3
- adcs x15,x15,x9
- umulh x9,x5,x3
- adcs x16,x16,x10
- umulh x10,x6,x3
- adcs x17,x17,x11
- umulh x11,x7,x3
- adc x19,x19,xzr
- mul x24,x14,x23
- adds x15,x15,x8 // accumulate high parts
- adcs x16,x16,x9
- adcs x17,x17,x10
- adcs x19,x19,x11
- adc x20,xzr,xzr
- lsl x8,x24,#32 // last reduction
- subs x16,x16,x24
- lsr x9,x24,#32
- sbcs x17,x17,x8
- sbcs x19,x19,x9
- sbc x20,x20,xzr
-
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- adc x11,x11,xzr
-
- adds x14,x15,x10
- adcs x15,x16,x11
- adcs x16,x17,x24
- adcs x17,x19,x24
- adc x19,x20,xzr
-
- subs x8,x14,x12 // ret -= modulus
- sbcs x9,x15,x13
- sbcs x10,x16,x21
- sbcs x11,x17,x22
- sbcs xzr,x19,xzr
-
- csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
- csel x15,x15,x9,lo
- csel x16,x16,x10,lo
- stp x14,x15,[x0]
- csel x17,x17,x11,lo
- stp x16,x17,[x0,#16]
-
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x23,x24,[sp,#48]
- ldr x29,[sp],#64
- ret
-
-
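The routine above interleaves a 4x4-word schoolbook multiply with a per-word Montgomery reduction against the P-256 group order loaded from Lord, with the extra word of Lord acting as the n0' factor in x23 (the lsl/lsr shortcuts exploit the special shape of the order's words). As a rough illustration of the same arithmetic, here is a plain-C Montgomery multiply that forms the full product first and reduces afterwards; it is a simplified, generic sketch, not the interleaved constant-time scheme above, and mont_mul_sketch and n0_prime are made-up names.

#include <stdint.h>

/* Sketch of 4-word Montgomery multiplication: r = a*b*2^-256 mod n, where
 * n0_prime = -n^-1 mod 2^64 (cf. the factor kept in x23 above). */
static void mont_mul_sketch(uint64_t r[4], const uint64_t a[4],
                            const uint64_t b[4], const uint64_t n[4],
                            uint64_t n0_prime) {
  uint64_t t[9] = {0};  /* 512-bit product plus one word of reduction carry */

  /* 1. Schoolbook product t = a*b. */
  for (int i = 0; i < 4; i++) {
    uint64_t carry = 0;
    for (int j = 0; j < 4; j++) {
      unsigned __int128 p = (unsigned __int128)a[i] * b[j] + t[i + j] + carry;
      t[i + j] = (uint64_t)p;
      carry = (uint64_t)(p >> 64);
    }
    t[i + 4] = carry;
  }

  /* 2. Four reduction steps: each adds m*n so the lowest live word becomes
   *    zero, which is the per-word folding the assembly performs. */
  for (int i = 0; i < 4; i++) {
    uint64_t m = t[i] * n0_prime;
    uint64_t carry = 0;
    for (int j = 0; j < 4; j++) {
      unsigned __int128 p = (unsigned __int128)m * n[j] + t[i + j] + carry;
      t[i + j] = (uint64_t)p;  /* t[i] becomes zero at j == 0 */
      carry = (uint64_t)(p >> 64);
    }
    for (int k = i + 4; carry != 0 && k < 9; k++) {
      unsigned __int128 s = (unsigned __int128)t[k] + carry;
      t[k] = (uint64_t)s;
      carry = (uint64_t)(s >> 64);
    }
  }

  /* 3. Result is t[4..8] < 2n: subtract n once if needed.  The assembly does
   *    this with sbcs + csel so the choice stays constant-time. */
  uint64_t d[4], borrow = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 s = (unsigned __int128)t[i + 4] - n[i] - borrow;
    d[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  int needs_sub = (t[8] != 0) || (borrow == 0);
  for (int i = 0; i < 4; i++) r[i] = needs_sub ? d[i] : t[i + 4];
}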
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
-// uint64_t rep);
-.globl _ecp_nistz256_ord_sqr_mont
-.private_extern _ecp_nistz256_ord_sqr_mont
-
-.align 4
-_ecp_nistz256_ord_sqr_mont:
- AARCH64_VALID_CALL_TARGET
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
-
- adrp x23,Lord@PAGE
- add x23,x23,Lord@PAGEOFF
- ldp x4,x5,[x1]
- ldp x6,x7,[x1,#16]
-
- ldp x12,x13,[x23,#0]
- ldp x21,x22,[x23,#16]
- ldr x23,[x23,#32]
- b Loop_ord_sqr
-
-.align 4
-Loop_ord_sqr:
- sub x2,x2,#1
- ////////////////////////////////////////////////////////////////
- // | | | | | |a1*a0| |
- // | | | | |a2*a0| | |
- // | |a3*a2|a3*a0| | | |
- // | | | |a2*a1| | | |
- // | | |a3*a1| | | | |
- // *| | | | | | | | 2|
- // +|a3*a3|a2*a2|a1*a1|a0*a0|
- // |--+--+--+--+--+--+--+--|
-	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is one 64-bit word of the
-	//  512-bit result, A0 being the least significant.
-	//
-	//  The "can't overflow" notes below mark carries into the high half of a
-	//  64x64-bit product; these additions cannot overflow, because a high
-	//  half can never be all ones.
-
- mul x15,x5,x4 // a[1]*a[0]
- umulh x9,x5,x4
- mul x16,x6,x4 // a[2]*a[0]
- umulh x10,x6,x4
- mul x17,x7,x4 // a[3]*a[0]
- umulh x19,x7,x4
-
- adds x16,x16,x9 // accumulate high parts of multiplication
- mul x8,x6,x5 // a[2]*a[1]
- umulh x9,x6,x5
- adcs x17,x17,x10
- mul x10,x7,x5 // a[3]*a[1]
- umulh x11,x7,x5
- adc x19,x19,xzr // can't overflow
-
- mul x20,x7,x6 // a[3]*a[2]
- umulh x1,x7,x6
-
- adds x9,x9,x10 // accumulate high parts of multiplication
- mul x14,x4,x4 // a[0]*a[0]
- adc x10,x11,xzr // can't overflow
-
- adds x17,x17,x8 // accumulate low parts of multiplication
- umulh x4,x4,x4
- adcs x19,x19,x9
- mul x9,x5,x5 // a[1]*a[1]
- adcs x20,x20,x10
- umulh x5,x5,x5
- adc x1,x1,xzr // can't overflow
-
- adds x15,x15,x15 // acc[1-6]*=2
- mul x10,x6,x6 // a[2]*a[2]
- adcs x16,x16,x16
- umulh x6,x6,x6
- adcs x17,x17,x17
- mul x11,x7,x7 // a[3]*a[3]
- adcs x19,x19,x19
- umulh x7,x7,x7
- adcs x20,x20,x20
- adcs x1,x1,x1
- adc x3,xzr,xzr
-
- adds x15,x15,x4 // +a[i]*a[i]
- mul x24,x14,x23
- adcs x16,x16,x9
- adcs x17,x17,x5
- adcs x19,x19,x10
- adcs x20,x20,x6
- adcs x1,x1,x11
- adc x3,x3,x7
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- adc x11,x11,xzr
-
- adds x14,x15,x10
- adcs x15,x16,x11
- adcs x16,x17,x24
- adc x17,xzr,x24 // can't overflow
- mul x11,x14,x23
- lsl x8,x24,#32
- subs x15,x15,x24
- lsr x9,x24,#32
- sbcs x16,x16,x8
- sbc x17,x17,x9 // can't borrow
- subs xzr,x14,#1
- umulh x9,x12,x11
- mul x10,x13,x11
- umulh x24,x13,x11
-
- adcs x10,x10,x9
- adc x24,x24,xzr
-
- adds x14,x15,x10
- adcs x15,x16,x24
- adcs x16,x17,x11
- adc x17,xzr,x11 // can't overflow
- mul x24,x14,x23
- lsl x8,x11,#32
- subs x15,x15,x11
- lsr x9,x11,#32
- sbcs x16,x16,x8
- sbc x17,x17,x9 // can't borrow
- subs xzr,x14,#1
- umulh x9,x12,x24
- mul x10,x13,x24
- umulh x11,x13,x24
-
- adcs x10,x10,x9
- adc x11,x11,xzr
-
- adds x14,x15,x10
- adcs x15,x16,x11
- adcs x16,x17,x24
- adc x17,xzr,x24 // can't overflow
- mul x11,x14,x23
- lsl x8,x24,#32
- subs x15,x15,x24
- lsr x9,x24,#32
- sbcs x16,x16,x8
- sbc x17,x17,x9 // can't borrow
- subs xzr,x14,#1
- umulh x9,x12,x11
- mul x10,x13,x11
- umulh x24,x13,x11
-
- adcs x10,x10,x9
- adc x24,x24,xzr
-
- adds x14,x15,x10
- adcs x15,x16,x24
- adcs x16,x17,x11
- adc x17,xzr,x11 // can't overflow
- lsl x8,x11,#32
- subs x15,x15,x11
- lsr x9,x11,#32
- sbcs x16,x16,x8
- sbc x17,x17,x9 // can't borrow
- adds x14,x14,x19 // accumulate upper half
- adcs x15,x15,x20
- adcs x16,x16,x1
- adcs x17,x17,x3
- adc x19,xzr,xzr
-
- subs x8,x14,x12 // ret -= modulus
- sbcs x9,x15,x13
- sbcs x10,x16,x21
- sbcs x11,x17,x22
- sbcs xzr,x19,xzr
-
- csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
- csel x5,x15,x9,lo
- csel x6,x16,x10,lo
- csel x7,x17,x11,lo
-
- cbnz x2,Loop_ord_sqr
-
- stp x4,x5,[x0]
- stp x6,x7,[x0,#16]
-
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x23,x24,[sp,#48]
- ldr x29,[sp],#64
- ret
-
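Unlike the multiplication routine, ecp_nistz256_ord_sqr_mont takes a repetition count: Loop_ord_sqr squares rep times (at least once, since it is a do/while) and only stores the final value, which suits the long runs of squarings an exponentiation-style inversion needs. A trivial sketch of that outer behaviour, with mont_sqr standing in for one Montgomery squaring (a hypothetical callback, assumed to tolerate out == in):

#include <stdint.h>

/* Hypothetical wrapper showing the `rep` behaviour of the routine above:
 * square in place `rep` times, write the result out once at the end. */
static void ord_sqr_times(uint64_t res[4], const uint64_t a[4], uint64_t rep,
                          void (*mont_sqr)(uint64_t out[4],
                                           const uint64_t in[4])) {
  uint64_t acc[4] = {a[0], a[1], a[2], a[3]};
  while (rep-- > 0) {
    mont_sqr(acc, acc);            /* one pass of Loop_ord_sqr */
  }
  for (int i = 0; i < 4; i++) {
    res[i] = acc[i];               /* the single stp pair after the loop */
  }
}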
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.globl _ecp_nistz256_select_w5
-.private_extern _ecp_nistz256_select_w5
-
-.align 4
-_ecp_nistz256_select_w5:
- AARCH64_VALID_CALL_TARGET
-
- // x10 := x0
- // w9 := 0; loop counter and incremented internal index
- mov x10, x0
- mov w9, #0
-
- // [v16-v21] := 0
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v19.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
-
-Lselect_w5_loop:
- // Loop 16 times.
-
- // Increment index (loop counter); tested at the end of the loop
- add w9, w9, #1
-
- // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
- // and advance x1 to point to the next entry
- ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
- // x11 := (w9 == w2)? All 1s : All 0s
- cmp w9, w2
- csetm x11, eq
-
- // continue loading ...
- ld1 {v26.2d, v27.2d}, [x1],#32
-
- // duplicate mask_64 into Mask (all 0s or all 1s)
- dup v3.2d, x11
-
- // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
- // i.e., values in output registers will remain the same if w9 != w2
- bit v16.16b, v22.16b, v3.16b
- bit v17.16b, v23.16b, v3.16b
-
- bit v18.16b, v24.16b, v3.16b
- bit v19.16b, v25.16b, v3.16b
-
- bit v20.16b, v26.16b, v3.16b
- bit v21.16b, v27.16b, v3.16b
-
-	// If bit #4 is 0 (i.e. idx_ctr < 16), loop back
- tbz w9, #4, Lselect_w5_loop
-
- // Write [v16-v21] to memory at the output pointer
- st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
- st1 {v20.2d, v21.2d}, [x10]
-
- ret
-
-
-
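The loop above is a constant-time table lookup: every one of the 16 entries is read, and the bit instructions keep only the entry whose 1-based position matches the index, so the memory access pattern never depends on the secret index. A plain-C sketch of the same idea follows; select_w5_sketch is a made-up name, the 12-word (96-byte) entry size matches the 3x256-bit records loaded above, and only the assembly, not this C, can promise the compiler keeps the selection branch-free.

#include <stdint.h>

/* Hypothetical C version of the select_w5 loop above. */
static void select_w5_sketch(uint64_t val[12], const uint64_t in_t[16 * 12],
                             uint64_t index) {
  for (int j = 0; j < 12; j++) val[j] = 0;         /* movi v16-v21, #0 */
  for (uint64_t i = 1; i <= 16; i++) {
    /* all-ones when i == index, all-zeros otherwise (cmp + csetm above) */
    uint64_t mask = 0 - (uint64_t)(i == index);
    for (int j = 0; j < 12; j++) {
      val[j] |= in_t[(i - 1) * 12 + j] & mask;     /* the bit v16..v21 ops */
    }
  }
}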
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl _ecp_nistz256_select_w7
-.private_extern _ecp_nistz256_select_w7
-
-.align 4
-_ecp_nistz256_select_w7:
- AARCH64_VALID_CALL_TARGET
-
- // w9 := 0; loop counter and incremented internal index
- mov w9, #0
-
-	// [v16-v19] := 0
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v19.16b, #0
-
-Lselect_w7_loop:
- // Loop 64 times.
-
- // Increment index (loop counter); tested at the end of the loop
- add w9, w9, #1
-
- // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
- // and advance x1 to point to the next entry
- ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
- // x11 := (w9 == w2)? All 1s : All 0s
- cmp w9, w2
- csetm x11, eq
-
- // duplicate mask_64 into Mask (all 0s or all 1s)
- dup v3.2d, x11
-
- // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
- // i.e., values in output registers will remain the same if w9 != w2
- bit v16.16b, v22.16b, v3.16b
- bit v17.16b, v23.16b, v3.16b
-
- bit v18.16b, v24.16b, v3.16b
- bit v19.16b, v25.16b, v3.16b
-
-	// If bit #6 is 0 (i.e. idx_ctr < 64), loop back
- tbz w9, #6, Lselect_w7_loop
-
- // Write [v16-v19] to memory at the output pointer
- st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
-
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S b/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S
deleted file mode 100644
index 49ea9b8..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S
+++ /dev/null
@@ -1,309 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include "openssl/arm_arch.h"
-
-.text
-.globl _beeu_mod_inverse_vartime
-.private_extern _beeu_mod_inverse_vartime
-
-.align 4
-_beeu_mod_inverse_vartime:
- // Reserve enough space for 14 8-byte registers on the stack
- // in the first stp call for x29, x30.
- // Then store the remaining callee-saved registers.
- //
- // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
- // ^ ^
- // sp <------------------- 112 bytes ----------------> old sp
- // x29 (FP)
- //
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-112]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- stp x0,x2,[sp,#96]
-
- // B = b3..b0 := a
- ldp x25,x26,[x1]
- ldp x27,x28,[x1,#16]
-
- // n3..n0 := n
-	// Note: the values of the input parameters are changed below.
- ldp x0,x1,[x2]
- ldp x2,x30,[x2,#16]
-
- // A = a3..a0 := n
- mov x21, x0
- mov x22, x1
- mov x23, x2
- mov x24, x30
-
- // X = x4..x0 := 1
- mov x3, #1
- eor x4, x4, x4
- eor x5, x5, x5
- eor x6, x6, x6
- eor x7, x7, x7
-
- // Y = y4..y0 := 0
- eor x8, x8, x8
- eor x9, x9, x9
- eor x10, x10, x10
- eor x11, x11, x11
- eor x12, x12, x12
-
-Lbeeu_loop:
- // if B == 0, jump to .Lbeeu_loop_end
- orr x14, x25, x26
- orr x14, x14, x27
-
-	// reverse the bit order of x25; needed for the clz below
- rbit x15, x25
-
- orr x14, x14, x28
- cbz x14,Lbeeu_loop_end
-
-
- // 0 < B < |n|,
- // 0 < A <= |n|,
- // (1) X*a == B (mod |n|),
- // (2) (-1)*Y*a == A (mod |n|)
-
- // Now divide B by the maximum possible power of two in the
- // integers, and divide X by the same value mod |n|.
- // When we're done, (1) still holds.
-
- // shift := number of trailing 0s in x25
-	// ( = number of leading 0s in x15, thanks to the rbit above)
- clz x13, x15
-
- // If there is no shift, goto shift_A_Y
- cbz x13, Lbeeu_shift_A_Y
-
- // Shift B right by "x13" bits
- neg x14, x13
- lsr x25, x25, x13
- lsl x15, x26, x14
-
- lsr x26, x26, x13
- lsl x19, x27, x14
-
- orr x25, x25, x15
-
- lsr x27, x27, x13
- lsl x20, x28, x14
-
- orr x26, x26, x19
-
- lsr x28, x28, x13
-
- orr x27, x27, x20
-
-
- // Shift X right by "x13" bits, adding n whenever X becomes odd.
- // x13--;
- // x14 := 0; needed in the addition to the most significant word in SHIFT1
- eor x14, x14, x14
-Lbeeu_shift_loop_X:
- tbz x3, #0, Lshift1_0
- adds x3, x3, x0
- adcs x4, x4, x1
- adcs x5, x5, x2
- adcs x6, x6, x30
- adc x7, x7, x14
-Lshift1_0:
- // var0 := [var1|var0]<64..1>;
- // i.e. concatenate var1 and var0,
- // extract bits <64..1> from the resulting 128-bit value
- // and put them in var0
- extr x3, x4, x3, #1
- extr x4, x5, x4, #1
- extr x5, x6, x5, #1
- extr x6, x7, x6, #1
- lsr x7, x7, #1
-
- subs x13, x13, #1
- bne Lbeeu_shift_loop_X
-
- // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
- // with the following differences:
- // - "x13" is set directly to the number of trailing 0s in B
- // (using rbit and clz instructions)
- // - The loop is only used to call SHIFT1(X)
- // and x13 is decreased while executing the X loop.
- // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
-
-Lbeeu_shift_A_Y:
- // Same for A and Y.
- // Afterwards, (2) still holds.
- // Reverse the bit order of x21
- // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
- rbit x15, x21
- clz x13, x15
-
- // If there is no shift, goto |B-A|, X+Y update
- cbz x13, Lbeeu_update_B_X_or_A_Y
-
- // Shift A right by "x13" bits
- neg x14, x13
- lsr x21, x21, x13
- lsl x15, x22, x14
-
- lsr x22, x22, x13
- lsl x19, x23, x14
-
- orr x21, x21, x15
-
- lsr x23, x23, x13
- lsl x20, x24, x14
-
- orr x22, x22, x19
-
- lsr x24, x24, x13
-
- orr x23, x23, x20
-
-
- // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
- // x13--;
- // x14 := 0; needed in the addition to the most significant word in SHIFT1
- eor x14, x14, x14
-Lbeeu_shift_loop_Y:
- tbz x8, #0, Lshift1_1
- adds x8, x8, x0
- adcs x9, x9, x1
- adcs x10, x10, x2
- adcs x11, x11, x30
- adc x12, x12, x14
-Lshift1_1:
- // var0 := [var1|var0]<64..1>;
- // i.e. concatenate var1 and var0,
- // extract bits <64..1> from the resulting 128-bit value
- // and put them in var0
- extr x8, x9, x8, #1
- extr x9, x10, x9, #1
- extr x10, x11, x10, #1
- extr x11, x12, x11, #1
- lsr x12, x12, #1
-
- subs x13, x13, #1
- bne Lbeeu_shift_loop_Y
-
-Lbeeu_update_B_X_or_A_Y:
- // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
-	// Note: this is unsigned arithmetic; T fits in 4 64-bit words and no
-	//       sign bit is kept. The lack of a carry (i.e. a borrow) would
-	//       indicate a negative result. See, for example,
- // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
- subs x14, x25, x21
- sbcs x15, x26, x22
- sbcs x19, x27, x23
- sbcs x20, x28, x24
- bcs Lbeeu_B_greater_than_A
-
- // Else A > B =>
- // A := A - B; Y := Y + X; goto beginning of the loop
- subs x21, x21, x25
- sbcs x22, x22, x26
- sbcs x23, x23, x27
- sbcs x24, x24, x28
-
- adds x8, x8, x3
- adcs x9, x9, x4
- adcs x10, x10, x5
- adcs x11, x11, x6
- adc x12, x12, x7
- b Lbeeu_loop
-
-Lbeeu_B_greater_than_A:
- // Continue with B > A =>
- // B := B - A; X := X + Y; goto beginning of the loop
- mov x25, x14
- mov x26, x15
- mov x27, x19
- mov x28, x20
-
- adds x3, x3, x8
- adcs x4, x4, x9
- adcs x5, x5, x10
- adcs x6, x6, x11
- adc x7, x7, x12
- b Lbeeu_loop
-
-Lbeeu_loop_end:
-	// Euclid's algorithm loop ends when A == gcd(a,n);
-	// this is 1 when a and n are coprime (i.e. share no common factor).
-	// Since (-1)*Y*a == A (mod |n|) and Y > 0,
-	// out = -Y mod n
-
- // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
- // Is A-1 == 0?
- // If not, fail.
- sub x14, x21, #1
- orr x14, x14, x22
- orr x14, x14, x23
- orr x14, x14, x24
- cbnz x14, Lbeeu_err
-
- // If Y>n ==> Y:=Y-n
-Lbeeu_reduction_loop:
- // x_i := y_i - n_i (X is no longer needed, use it as temp)
- // (x14 = 0 from above)
- subs x3, x8, x0
- sbcs x4, x9, x1
- sbcs x5, x10, x2
- sbcs x6, x11, x30
- sbcs x7, x12, x14
-
- // If result is non-negative (i.e., cs = carry set = no borrow),
- // y_i := x_i; goto reduce again
- // else
- // y_i := y_i; continue
- csel x8, x3, x8, cs
- csel x9, x4, x9, cs
- csel x10, x5, x10, cs
- csel x11, x6, x11, cs
- csel x12, x7, x12, cs
- bcs Lbeeu_reduction_loop
-
- // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
- // out = -Y = n-Y
- subs x8, x0, x8
- sbcs x9, x1, x9
- sbcs x10, x2, x10
- sbcs x11, x30, x11
-
- // Save Y in output (out (x0) was saved on the stack)
- ldr x3, [sp,#96]
- stp x8, x9, [x3]
- stp x10, x11, [x3,#16]
- // return 1 (success)
- mov x0, #1
- b Lbeeu_finish
-
-Lbeeu_err:
- // return 0 (error)
- eor x0, x0, x0
-
-Lbeeu_finish:
- // Restore callee-saved registers, except x0, x2
- add sp,x29,#0
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x23,x24,[sp,#48]
- ldp x25,x26,[sp,#64]
- ldp x27,x28,[sp,#80]
- ldp x29,x30,[sp],#112
-
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
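For orientation, here is the same binary extended Euclidean ("BEEU") loop written out for single 64-bit words: it keeps the two invariants stated in the comments above (X*a == B and (-1)*Y*a == A, mod n), strips powers of two from B and A while halving X and Y mod n, and subtracts the smaller value from the larger until B reaches zero, just like the vartime assembly but without the 256-bit multi-word bookkeeping. It assumes n is odd and n < 2^63 so the additions cannot overflow; beeu_inverse_sketch is a hypothetical name, and it returns the inverse (or 0 on failure) instead of a status flag plus an out pointer.

#include <stdint.h>

/* Single-word sketch of the binary extended Euclidean modular inverse.
 * Assumptions: n odd, n < 2^63.  Invariants, as in the comments above:
 *   X*a == B (mod n)   and   (-1)*Y*a == A (mod n). */
static uint64_t beeu_inverse_sketch(uint64_t a, uint64_t n) {
  uint64_t A = n, B = a % n;
  uint64_t X = 1, Y = 0;
  while (B != 0) {
    while ((B & 1) == 0) {               /* SHIFT256(B) + Lbeeu_shift_loop_X */
      B >>= 1;
      X = (X & 1) ? (X >> 1) + (n >> 1) + 1 : X >> 1;   /* (X + n)/2 mod n */
    }
    while ((A & 1) == 0) {               /* Lbeeu_shift_A_Y */
      A >>= 1;
      Y = (Y & 1) ? (Y >> 1) + (n >> 1) + 1 : Y >> 1;
    }
    if (B >= A) {                        /* Lbeeu_B_greater_than_A */
      B -= A;
      X = (X + Y) % n;
    } else {
      A -= B;
      Y = (Y + X) % n;
    }
  }
  if (A != 1) return 0;                  /* not coprime: Lbeeu_err */
  return (n - Y) % n;                    /* out = -Y mod n */
}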
diff --git a/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S
deleted file mode 100644
index 744c630..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S
+++ /dev/null
@@ -1,1227 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-
-.private_extern _OPENSSL_armcap_P
-.globl _sha1_block_data_order
-.private_extern _sha1_block_data_order
-
-.align 6
-_sha1_block_data_order:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- AARCH64_VALID_CALL_TARGET
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
- adrp x16,_OPENSSL_armcap_P@PAGE
-#endif
- ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
- tst w16,#ARMV8_SHA1
- b.ne Lv8_entry
-
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
- ldp w20,w21,[x0]
- ldp w22,w23,[x0,#8]
- ldr w24,[x0,#16]
-
-Loop:
- ldr x3,[x1],#64
- movz w28,#0x7999
- sub x2,x2,#1
- movk w28,#0x5a82,lsl#16
-#ifdef __AARCH64EB__
- ror x3,x3,#32
-#else
- rev32 x3,x3
-#endif
- add w24,w24,w28 // warm it up
- add w24,w24,w3
- lsr x4,x3,#32
- ldr x5,[x1,#-56]
- bic w25,w23,w21
- and w26,w22,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- orr w25,w25,w26
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- add w23,w23,w4 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x5,x5,#32
-#else
- rev32 x5,x5
-#endif
- bic w25,w22,w20
- and w26,w21,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- orr w25,w25,w26
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- add w22,w22,w5 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- lsr x6,x5,#32
- ldr x7,[x1,#-48]
- bic w25,w21,w24
- and w26,w20,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- orr w25,w25,w26
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- add w21,w21,w6 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x7,x7,#32
-#else
- rev32 x7,x7
-#endif
- bic w25,w20,w23
- and w26,w24,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- orr w25,w25,w26
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- add w20,w20,w7 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- lsr x8,x7,#32
- ldr x9,[x1,#-40]
- bic w25,w24,w22
- and w26,w23,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- orr w25,w25,w26
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- add w24,w24,w8 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x9,x9,#32
-#else
- rev32 x9,x9
-#endif
- bic w25,w23,w21
- and w26,w22,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- orr w25,w25,w26
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- add w23,w23,w9 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- lsr x10,x9,#32
- ldr x11,[x1,#-32]
- bic w25,w22,w20
- and w26,w21,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- orr w25,w25,w26
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- add w22,w22,w10 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x11,x11,#32
-#else
- rev32 x11,x11
-#endif
- bic w25,w21,w24
- and w26,w20,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- orr w25,w25,w26
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- add w21,w21,w11 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- lsr x12,x11,#32
- ldr x13,[x1,#-24]
- bic w25,w20,w23
- and w26,w24,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- orr w25,w25,w26
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- add w20,w20,w12 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x13,x13,#32
-#else
- rev32 x13,x13
-#endif
- bic w25,w24,w22
- and w26,w23,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- orr w25,w25,w26
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- add w24,w24,w13 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- lsr x14,x13,#32
- ldr x15,[x1,#-16]
- bic w25,w23,w21
- and w26,w22,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- orr w25,w25,w26
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- add w23,w23,w14 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x15,x15,#32
-#else
- rev32 x15,x15
-#endif
- bic w25,w22,w20
- and w26,w21,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- orr w25,w25,w26
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- add w22,w22,w15 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- lsr x16,x15,#32
- ldr x17,[x1,#-8]
- bic w25,w21,w24
- and w26,w20,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- orr w25,w25,w26
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- add w21,w21,w16 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
-#ifdef __AARCH64EB__
- ror x17,x17,#32
-#else
- rev32 x17,x17
-#endif
- bic w25,w20,w23
- and w26,w24,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- orr w25,w25,w26
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- add w20,w20,w17 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- lsr x19,x17,#32
- eor w3,w3,w5
- bic w25,w24,w22
- and w26,w23,w22
- ror w27,w21,#27
- eor w3,w3,w11
- add w24,w24,w28 // future e+=K
- orr w25,w25,w26
- add w20,w20,w27 // e+=rot(a,5)
- eor w3,w3,w16
- ror w22,w22,#2
- add w24,w24,w19 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w3,w3,#31
- eor w4,w4,w6
- bic w25,w23,w21
- and w26,w22,w21
- ror w27,w20,#27
- eor w4,w4,w12
- add w23,w23,w28 // future e+=K
- orr w25,w25,w26
- add w24,w24,w27 // e+=rot(a,5)
- eor w4,w4,w17
- ror w21,w21,#2
- add w23,w23,w3 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w4,w4,#31
- eor w5,w5,w7
- bic w25,w22,w20
- and w26,w21,w20
- ror w27,w24,#27
- eor w5,w5,w13
- add w22,w22,w28 // future e+=K
- orr w25,w25,w26
- add w23,w23,w27 // e+=rot(a,5)
- eor w5,w5,w19
- ror w20,w20,#2
- add w22,w22,w4 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w5,w5,#31
- eor w6,w6,w8
- bic w25,w21,w24
- and w26,w20,w24
- ror w27,w23,#27
- eor w6,w6,w14
- add w21,w21,w28 // future e+=K
- orr w25,w25,w26
- add w22,w22,w27 // e+=rot(a,5)
- eor w6,w6,w3
- ror w24,w24,#2
- add w21,w21,w5 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w6,w6,#31
- eor w7,w7,w9
- bic w25,w20,w23
- and w26,w24,w23
- ror w27,w22,#27
- eor w7,w7,w15
- add w20,w20,w28 // future e+=K
- orr w25,w25,w26
- add w21,w21,w27 // e+=rot(a,5)
- eor w7,w7,w4
- ror w23,w23,#2
- add w20,w20,w6 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w7,w7,#31
- movz w28,#0xeba1
- movk w28,#0x6ed9,lsl#16
- eor w8,w8,w10
- bic w25,w24,w22
- and w26,w23,w22
- ror w27,w21,#27
- eor w8,w8,w16
- add w24,w24,w28 // future e+=K
- orr w25,w25,w26
- add w20,w20,w27 // e+=rot(a,5)
- eor w8,w8,w5
- ror w22,w22,#2
- add w24,w24,w7 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w8,w8,#31
- eor w9,w9,w11
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w9,w9,w17
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w9,w9,w6
- add w23,w23,w8 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w9,w9,#31
- eor w10,w10,w12
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w10,w10,w19
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w10,w10,w7
- add w22,w22,w9 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w10,w10,#31
- eor w11,w11,w13
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w11,w11,w3
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w11,w11,w8
- add w21,w21,w10 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w11,w11,#31
- eor w12,w12,w14
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w12,w12,w4
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w12,w12,w9
- add w20,w20,w11 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w12,w12,#31
- eor w13,w13,w15
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w13,w13,w5
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w13,w13,w10
- add w24,w24,w12 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w13,w13,#31
- eor w14,w14,w16
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w14,w14,w6
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w14,w14,w11
- add w23,w23,w13 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w14,w14,#31
- eor w15,w15,w17
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w15,w15,w7
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w15,w15,w12
- add w22,w22,w14 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w15,w15,#31
- eor w16,w16,w19
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w16,w16,w8
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w16,w16,w13
- add w21,w21,w15 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w16,w16,#31
- eor w17,w17,w3
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w17,w17,w9
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w17,w17,w14
- add w20,w20,w16 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w17,w17,#31
- eor w19,w19,w4
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w19,w19,w10
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w19,w19,w15
- add w24,w24,w17 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w19,w19,#31
- eor w3,w3,w5
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w3,w3,w11
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w3,w3,w16
- add w23,w23,w19 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w3,w3,#31
- eor w4,w4,w6
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w4,w4,w12
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w4,w4,w17
- add w22,w22,w3 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w4,w4,#31
- eor w5,w5,w7
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w5,w5,w13
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w5,w5,w19
- add w21,w21,w4 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w5,w5,#31
- eor w6,w6,w8
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w6,w6,w14
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w6,w6,w3
- add w20,w20,w5 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w6,w6,#31
- eor w7,w7,w9
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w7,w7,w15
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w7,w7,w4
- add w24,w24,w6 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w7,w7,#31
- eor w8,w8,w10
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w8,w8,w16
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w8,w8,w5
- add w23,w23,w7 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w8,w8,#31
- eor w9,w9,w11
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w9,w9,w17
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w9,w9,w6
- add w22,w22,w8 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w9,w9,#31
- eor w10,w10,w12
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w10,w10,w19
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w10,w10,w7
- add w21,w21,w9 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w10,w10,#31
- eor w11,w11,w13
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w11,w11,w3
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w11,w11,w8
- add w20,w20,w10 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w11,w11,#31
- movz w28,#0xbcdc
- movk w28,#0x8f1b,lsl#16
- eor w12,w12,w14
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w12,w12,w4
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w12,w12,w9
- add w24,w24,w11 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w12,w12,#31
- orr w25,w21,w22
- and w26,w21,w22
- eor w13,w13,w15
- ror w27,w20,#27
- and w25,w25,w23
- add w23,w23,w28 // future e+=K
- eor w13,w13,w5
- add w24,w24,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w21,w21,#2
- eor w13,w13,w10
- add w23,w23,w12 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w13,w13,#31
- orr w25,w20,w21
- and w26,w20,w21
- eor w14,w14,w16
- ror w27,w24,#27
- and w25,w25,w22
- add w22,w22,w28 // future e+=K
- eor w14,w14,w6
- add w23,w23,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w20,w20,#2
- eor w14,w14,w11
- add w22,w22,w13 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w14,w14,#31
- orr w25,w24,w20
- and w26,w24,w20
- eor w15,w15,w17
- ror w27,w23,#27
- and w25,w25,w21
- add w21,w21,w28 // future e+=K
- eor w15,w15,w7
- add w22,w22,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w24,w24,#2
- eor w15,w15,w12
- add w21,w21,w14 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w15,w15,#31
- orr w25,w23,w24
- and w26,w23,w24
- eor w16,w16,w19
- ror w27,w22,#27
- and w25,w25,w20
- add w20,w20,w28 // future e+=K
- eor w16,w16,w8
- add w21,w21,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w23,w23,#2
- eor w16,w16,w13
- add w20,w20,w15 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w16,w16,#31
- orr w25,w22,w23
- and w26,w22,w23
- eor w17,w17,w3
- ror w27,w21,#27
- and w25,w25,w24
- add w24,w24,w28 // future e+=K
- eor w17,w17,w9
- add w20,w20,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w22,w22,#2
- eor w17,w17,w14
- add w24,w24,w16 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w17,w17,#31
- orr w25,w21,w22
- and w26,w21,w22
- eor w19,w19,w4
- ror w27,w20,#27
- and w25,w25,w23
- add w23,w23,w28 // future e+=K
- eor w19,w19,w10
- add w24,w24,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w21,w21,#2
- eor w19,w19,w15
- add w23,w23,w17 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w19,w19,#31
- orr w25,w20,w21
- and w26,w20,w21
- eor w3,w3,w5
- ror w27,w24,#27
- and w25,w25,w22
- add w22,w22,w28 // future e+=K
- eor w3,w3,w11
- add w23,w23,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w20,w20,#2
- eor w3,w3,w16
- add w22,w22,w19 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w3,w3,#31
- orr w25,w24,w20
- and w26,w24,w20
- eor w4,w4,w6
- ror w27,w23,#27
- and w25,w25,w21
- add w21,w21,w28 // future e+=K
- eor w4,w4,w12
- add w22,w22,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w24,w24,#2
- eor w4,w4,w17
- add w21,w21,w3 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w4,w4,#31
- orr w25,w23,w24
- and w26,w23,w24
- eor w5,w5,w7
- ror w27,w22,#27
- and w25,w25,w20
- add w20,w20,w28 // future e+=K
- eor w5,w5,w13
- add w21,w21,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w23,w23,#2
- eor w5,w5,w19
- add w20,w20,w4 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w5,w5,#31
- orr w25,w22,w23
- and w26,w22,w23
- eor w6,w6,w8
- ror w27,w21,#27
- and w25,w25,w24
- add w24,w24,w28 // future e+=K
- eor w6,w6,w14
- add w20,w20,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w22,w22,#2
- eor w6,w6,w3
- add w24,w24,w5 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w6,w6,#31
- orr w25,w21,w22
- and w26,w21,w22
- eor w7,w7,w9
- ror w27,w20,#27
- and w25,w25,w23
- add w23,w23,w28 // future e+=K
- eor w7,w7,w15
- add w24,w24,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w21,w21,#2
- eor w7,w7,w4
- add w23,w23,w6 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w7,w7,#31
- orr w25,w20,w21
- and w26,w20,w21
- eor w8,w8,w10
- ror w27,w24,#27
- and w25,w25,w22
- add w22,w22,w28 // future e+=K
- eor w8,w8,w16
- add w23,w23,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w20,w20,#2
- eor w8,w8,w5
- add w22,w22,w7 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w8,w8,#31
- orr w25,w24,w20
- and w26,w24,w20
- eor w9,w9,w11
- ror w27,w23,#27
- and w25,w25,w21
- add w21,w21,w28 // future e+=K
- eor w9,w9,w17
- add w22,w22,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w24,w24,#2
- eor w9,w9,w6
- add w21,w21,w8 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w9,w9,#31
- orr w25,w23,w24
- and w26,w23,w24
- eor w10,w10,w12
- ror w27,w22,#27
- and w25,w25,w20
- add w20,w20,w28 // future e+=K
- eor w10,w10,w19
- add w21,w21,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w23,w23,#2
- eor w10,w10,w7
- add w20,w20,w9 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w10,w10,#31
- orr w25,w22,w23
- and w26,w22,w23
- eor w11,w11,w13
- ror w27,w21,#27
- and w25,w25,w24
- add w24,w24,w28 // future e+=K
- eor w11,w11,w3
- add w20,w20,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w22,w22,#2
- eor w11,w11,w8
- add w24,w24,w10 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w11,w11,#31
- orr w25,w21,w22
- and w26,w21,w22
- eor w12,w12,w14
- ror w27,w20,#27
- and w25,w25,w23
- add w23,w23,w28 // future e+=K
- eor w12,w12,w4
- add w24,w24,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w21,w21,#2
- eor w12,w12,w9
- add w23,w23,w11 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w12,w12,#31
- orr w25,w20,w21
- and w26,w20,w21
- eor w13,w13,w15
- ror w27,w24,#27
- and w25,w25,w22
- add w22,w22,w28 // future e+=K
- eor w13,w13,w5
- add w23,w23,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w20,w20,#2
- eor w13,w13,w10
- add w22,w22,w12 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w13,w13,#31
- orr w25,w24,w20
- and w26,w24,w20
- eor w14,w14,w16
- ror w27,w23,#27
- and w25,w25,w21
- add w21,w21,w28 // future e+=K
- eor w14,w14,w6
- add w22,w22,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w24,w24,#2
- eor w14,w14,w11
- add w21,w21,w13 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w14,w14,#31
- orr w25,w23,w24
- and w26,w23,w24
- eor w15,w15,w17
- ror w27,w22,#27
- and w25,w25,w20
- add w20,w20,w28 // future e+=K
- eor w15,w15,w7
- add w21,w21,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w23,w23,#2
- eor w15,w15,w12
- add w20,w20,w14 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w15,w15,#31
- movz w28,#0xc1d6
- movk w28,#0xca62,lsl#16
- orr w25,w22,w23
- and w26,w22,w23
- eor w16,w16,w19
- ror w27,w21,#27
- and w25,w25,w24
- add w24,w24,w28 // future e+=K
- eor w16,w16,w8
- add w20,w20,w27 // e+=rot(a,5)
- orr w25,w25,w26
- ror w22,w22,#2
- eor w16,w16,w13
- add w24,w24,w15 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w16,w16,#31
- eor w17,w17,w3
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w17,w17,w9
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w17,w17,w14
- add w23,w23,w16 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w17,w17,#31
- eor w19,w19,w4
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w19,w19,w10
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w19,w19,w15
- add w22,w22,w17 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w19,w19,#31
- eor w3,w3,w5
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w3,w3,w11
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w3,w3,w16
- add w21,w21,w19 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w3,w3,#31
- eor w4,w4,w6
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w4,w4,w12
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w4,w4,w17
- add w20,w20,w3 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w4,w4,#31
- eor w5,w5,w7
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w5,w5,w13
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w5,w5,w19
- add w24,w24,w4 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w5,w5,#31
- eor w6,w6,w8
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w6,w6,w14
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w6,w6,w3
- add w23,w23,w5 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w6,w6,#31
- eor w7,w7,w9
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w7,w7,w15
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w7,w7,w4
- add w22,w22,w6 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w7,w7,#31
- eor w8,w8,w10
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w8,w8,w16
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w8,w8,w5
- add w21,w21,w7 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w8,w8,#31
- eor w9,w9,w11
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w9,w9,w17
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w9,w9,w6
- add w20,w20,w8 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w9,w9,#31
- eor w10,w10,w12
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w10,w10,w19
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w10,w10,w7
- add w24,w24,w9 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w10,w10,#31
- eor w11,w11,w13
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w11,w11,w3
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w11,w11,w8
- add w23,w23,w10 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w11,w11,#31
- eor w12,w12,w14
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w12,w12,w4
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w12,w12,w9
- add w22,w22,w11 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w12,w12,#31
- eor w13,w13,w15
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w13,w13,w5
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w13,w13,w10
- add w21,w21,w12 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w13,w13,#31
- eor w14,w14,w16
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w14,w14,w6
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- eor w14,w14,w11
- add w20,w20,w13 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ror w14,w14,#31
- eor w15,w15,w17
- eor w25,w24,w22
- ror w27,w21,#27
- add w24,w24,w28 // future e+=K
- eor w15,w15,w7
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- eor w15,w15,w12
- add w24,w24,w14 // future e+=X[i]
- add w20,w20,w25 // e+=F(b,c,d)
- ror w15,w15,#31
- eor w16,w16,w19
- eor w25,w23,w21
- ror w27,w20,#27
- add w23,w23,w28 // future e+=K
- eor w16,w16,w8
- eor w25,w25,w22
- add w24,w24,w27 // e+=rot(a,5)
- ror w21,w21,#2
- eor w16,w16,w13
- add w23,w23,w15 // future e+=X[i]
- add w24,w24,w25 // e+=F(b,c,d)
- ror w16,w16,#31
- eor w17,w17,w3
- eor w25,w22,w20
- ror w27,w24,#27
- add w22,w22,w28 // future e+=K
- eor w17,w17,w9
- eor w25,w25,w21
- add w23,w23,w27 // e+=rot(a,5)
- ror w20,w20,#2
- eor w17,w17,w14
- add w22,w22,w16 // future e+=X[i]
- add w23,w23,w25 // e+=F(b,c,d)
- ror w17,w17,#31
- eor w19,w19,w4
- eor w25,w21,w24
- ror w27,w23,#27
- add w21,w21,w28 // future e+=K
- eor w19,w19,w10
- eor w25,w25,w20
- add w22,w22,w27 // e+=rot(a,5)
- ror w24,w24,#2
- eor w19,w19,w15
- add w21,w21,w17 // future e+=X[i]
- add w22,w22,w25 // e+=F(b,c,d)
- ror w19,w19,#31
- ldp w4,w5,[x0]
- eor w25,w20,w23
- ror w27,w22,#27
- add w20,w20,w28 // future e+=K
- eor w25,w25,w24
- add w21,w21,w27 // e+=rot(a,5)
- ror w23,w23,#2
- add w20,w20,w19 // future e+=X[i]
- add w21,w21,w25 // e+=F(b,c,d)
- ldp w6,w7,[x0,#8]
- eor w25,w24,w22
- ror w27,w21,#27
- eor w25,w25,w23
- add w20,w20,w27 // e+=rot(a,5)
- ror w22,w22,#2
- ldr w8,[x0,#16]
- add w20,w20,w25 // e+=F(b,c,d)
- add w21,w21,w5
- add w22,w22,w6
- add w20,w20,w4
- add w23,w23,w7
- add w24,w24,w8
- stp w20,w21,[x0]
- stp w22,w23,[x0,#8]
- str w24,[x0,#16]
- cbnz x2,Loop
-
- ldp x19,x20,[sp,#16]
- ldp x21,x22,[sp,#32]
- ldp x23,x24,[sp,#48]
- ldp x25,x26,[sp,#64]
- ldp x27,x28,[sp,#80]
- ldr x29,[sp],#96
- ret
-
-
-.align 6
-sha1_block_armv8:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- AARCH64_VALID_CALL_TARGET
-Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- adrp x4,Lconst@PAGE
- add x4,x4,Lconst@PAGEOFF
- eor v1.16b,v1.16b,v1.16b
- ld1 {v0.4s},[x0],#16
- ld1 {v1.s}[0],[x0]
- sub x0,x0,#16
- ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]
-
-Loop_hw:
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
- sub x2,x2,#1
- rev32 v4.16b,v4.16b
- rev32 v5.16b,v5.16b
-
- add v20.4s,v16.4s,v4.4s
- rev32 v6.16b,v6.16b
- orr v22.16b,v0.16b,v0.16b // offload
-
- add v21.4s,v16.4s,v5.4s
- rev32 v7.16b,v7.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b
-.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
- add v20.4s,v16.4s,v6.4s
-.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 1
-.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
- add v21.4s,v16.4s,v7.4s
-.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
-.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 2
-.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
- add v20.4s,v16.4s,v4.4s
-.long 0x5e281885 //sha1su1 v5.16b,v4.16b
-.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 3
-.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
- add v21.4s,v17.4s,v5.4s
-.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
-.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 4
-.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
- add v20.4s,v17.4s,v6.4s
-.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
-.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 5
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
- add v21.4s,v17.4s,v7.4s
-.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
-.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 6
-.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
- add v20.4s,v17.4s,v4.4s
-.long 0x5e281885 //sha1su1 v5.16b,v4.16b
-.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 7
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
- add v21.4s,v17.4s,v5.4s
-.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
-.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 8
-.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
- add v20.4s,v18.4s,v6.4s
-.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
-.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 9
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
- add v21.4s,v18.4s,v7.4s
-.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
-.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 10
-.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
- add v20.4s,v18.4s,v4.4s
-.long 0x5e281885 //sha1su1 v5.16b,v4.16b
-.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 11
-.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
- add v21.4s,v18.4s,v5.4s
-.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
-.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 12
-.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
- add v20.4s,v18.4s,v6.4s
-.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
-.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 13
-.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
- add v21.4s,v19.4s,v7.4s
-.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
-.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 14
-.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
- add v20.4s,v19.4s,v4.4s
-.long 0x5e281885 //sha1su1 v5.16b,v4.16b
-.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 15
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
- add v21.4s,v19.4s,v5.4s
-.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
-.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 16
-.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
- add v20.4s,v19.4s,v6.4s
-.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 17
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
- add v21.4s,v19.4s,v7.4s
-
-.long 0x5e280803 //sha1h v3.16b,v0.16b // 18
-.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
-
-.long 0x5e280802 //sha1h v2.16b,v0.16b // 19
-.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
-
- add v1.4s,v1.4s,v2.4s
- add v0.4s,v0.4s,v22.4s
-
- cbnz x2,Loop_hw
-
- st1 {v0.4s},[x0],#16
- st1 {v1.s}[0],[x0]
-
- ldr x29,[sp],#16
- ret
-
-.section __TEXT,__const
-.align 6
-Lconst:
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
-.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
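The scalar SHA-1 path above repeats one round per message word, annotated as "e+=K", "e+=rot(a,5)" and "e+=F(b,c,d)", with the four round constants materialised by the movz/movk pairs (and listed again in Lconst for the hardware path). For reference, one SHA-1 round in plain C looks roughly like this; sha1_round and rotl32 are illustrative names, and the unrolled assembly folds the rotation and state renaming into its register allocation rather than rotating an array.

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int s) { return (x << s) | (x >> (32 - s)); }

/* One SHA-1 round: st[] = {a,b,c,d,e}, w = expanded message word, t = round
 * number 0..79.  The Ch/Parity/Maj cases match the bic/and/orr and orr/and
 * patterns in the rounds above; ror #2 corresponds to rotl by 30. */
static void sha1_round(uint32_t st[5], uint32_t w, int t) {
  uint32_t a = st[0], b = st[1], c = st[2], d = st[3], e = st[4];
  uint32_t f, k;
  if (t < 20)      { f = (b & c) | (~b & d);          k = 0x5a827999; } /* Ch */
  else if (t < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; } /* Parity */
  else if (t < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc; } /* Maj */
  else             { f = b ^ c ^ d;                   k = 0xca62c1d6; } /* Parity */
  uint32_t tmp = rotl32(a, 5) + f + e + k + w;  /* e += rot(a,5)+F(b,c,d)+K+X[i] */
  st[4] = d;
  st[3] = c;
  st[2] = rotl32(b, 30);                        /* the "ror ...,#2" above */
  st[1] = a;
  st[0] = tmp;
}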
diff --git a/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S
deleted file mode 100644
index b54bcf9..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S
+++ /dev/null
@@ -1,1204 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-	//              SHA256-hw   SHA256(*)       SHA512
-	// Apple A7     1.97        10.5 (+33%)     6.73 (-1%(**))
-	// Cortex-A53   2.38        15.5 (+115%)    10.0 (+150%(***))
-	// Cortex-A57   2.31        11.6 (+86%)     7.51 (+260%(***))
-	// Denver       2.01        10.5 (+26%)     6.70 (+8%)
-	// X-Gene                   20.0 (+100%)    12.8 (+300%(***))
-	// Mongoose     2.36        13.0 (+50%)     8.36 (+33%)
-	// Kryo         1.92        17.4 (+30%)     11.2 (+8%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-	// (***) Super-impressive coefficients over gcc-generated code are an
-	//       indication of some compiler "pathology"; most notably, code
-	//       generated with -mgeneral-regs-only is significantly faster,
-	//       and the gap is then only 40-90%.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
-.text
-
-
-.private_extern _OPENSSL_armcap_P
-.globl _sha256_block_data_order
-.private_extern _sha256_block_data_order
-
-.align 6
-_sha256_block_data_order:
- AARCH64_VALID_CALL_TARGET
-#ifndef __KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
- adrp x16,_OPENSSL_armcap_P@PAGE
-#endif
- ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
- tst w16,#ARMV8_SHA256
- b.ne Lv8_entry
-#endif
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*4
-
- ldp w20,w21,[x0] // load context
- ldp w22,w23,[x0,#2*4]
- ldp w24,w25,[x0,#4*4]
- add x2,x1,x2,lsl#6 // end of input
- ldp w26,w27,[x0,#6*4]
- adrp x30,LK256@PAGE
- add x30,x30,LK256@PAGEOFF
- stp x0,x2,[x29,#96]
-
-Loop:
- ldp w3,w4,[x1],#2*4
- ldr w19,[x30],#4 // *K++
- eor w28,w21,w22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev w3,w3 // 0
-#endif
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w6,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w3 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w4,w4 // 1
-#endif
- ldp w5,w6,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w7,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w4 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w5,w5 // 2
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w8,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w5 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w6,w6 // 3
-#endif
- ldp w7,w8,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w9,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w6 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w7,w7 // 4
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w10,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w7 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w10,ror#11 // Sigma1(e)
- ror w10,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w10,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w8,w8 // 5
-#endif
- ldp w9,w10,[x1],#2*4
- add w23,w23,w17 // h+=Sigma0(a)
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w11,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w8 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w11,ror#11 // Sigma1(e)
- ror w11,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w11,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w9,w9 // 6
-#endif
- add w22,w22,w17 // h+=Sigma0(a)
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w12,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w9 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w12,ror#11 // Sigma1(e)
- ror w12,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w12,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w10,w10 // 7
-#endif
- ldp w11,w12,[x1],#2*4
- add w21,w21,w17 // h+=Sigma0(a)
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- eor w13,w25,w25,ror#14
- and w17,w26,w25
- bic w28,w27,w25
- add w20,w20,w10 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w13,ror#11 // Sigma1(e)
- ror w13,w21,#2
- add w20,w20,w17 // h+=Ch(e,f,g)
- eor w17,w21,w21,ror#9
- add w20,w20,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w24,w24,w20 // d+=h
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w13,w17,ror#13 // Sigma0(a)
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w20,w20,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w11,w11 // 8
-#endif
- add w20,w20,w17 // h+=Sigma0(a)
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w14,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w11 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w14,ror#11 // Sigma1(e)
- ror w14,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w14,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w12,w12 // 9
-#endif
- ldp w13,w14,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w15,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w12 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w15,ror#11 // Sigma1(e)
- ror w15,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w15,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w13,w13 // 10
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w0,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w13 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w0,ror#11 // Sigma1(e)
- ror w0,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w0,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w14,w14 // 11
-#endif
- ldp w15,w0,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w6,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w14 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w15,w15 // 12
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w7,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w15 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w0,w0 // 13
-#endif
- ldp w1,w2,[x1]
- add w23,w23,w17 // h+=Sigma0(a)
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w8,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w0 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w1,w1 // 14
-#endif
- ldr w6,[sp,#12]
- add w22,w22,w17 // h+=Sigma0(a)
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w9,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w1 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w2,w2 // 15
-#endif
- ldr w7,[sp,#0]
- add w21,w21,w17 // h+=Sigma0(a)
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
-Loop_16_xx:
- ldr w8,[sp,#4]
- str w11,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w10,w5,#7
- and w17,w25,w24
- ror w9,w2,#17
- bic w19,w26,w24
- ror w11,w20,#2
- add w27,w27,w3 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w10,w10,w5,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w11,w11,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w9,w9,w2,ror#19
- eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w11,w20,ror#22 // Sigma0(a)
- eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
- add w4,w4,w13
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w4,w4,w10
- add w27,w27,w17 // h+=Sigma0(a)
- add w4,w4,w9
- ldr w9,[sp,#8]
- str w12,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w11,w6,#7
- and w17,w24,w23
- ror w10,w3,#17
- bic w28,w25,w23
- ror w12,w27,#2
- add w26,w26,w4 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w11,w11,w6,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w12,w12,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w10,w10,w3,ror#19
- eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w12,w27,ror#22 // Sigma0(a)
- eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
- add w5,w5,w14
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w5,w5,w11
- add w26,w26,w17 // h+=Sigma0(a)
- add w5,w5,w10
- ldr w10,[sp,#12]
- str w13,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w12,w7,#7
- and w17,w23,w22
- ror w11,w4,#17
- bic w19,w24,w22
- ror w13,w26,#2
- add w25,w25,w5 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w12,w12,w7,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w13,w13,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w11,w11,w4,ror#19
- eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w13,w26,ror#22 // Sigma0(a)
- eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
- add w6,w6,w15
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w6,w6,w12
- add w25,w25,w17 // h+=Sigma0(a)
- add w6,w6,w11
- ldr w11,[sp,#0]
- str w14,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w13,w8,#7
- and w17,w22,w21
- ror w12,w5,#17
- bic w28,w23,w21
- ror w14,w25,#2
- add w24,w24,w6 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w13,w13,w8,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w14,w14,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w12,w12,w5,ror#19
- eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w14,w25,ror#22 // Sigma0(a)
- eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
- add w7,w7,w0
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w7,w7,w13
- add w24,w24,w17 // h+=Sigma0(a)
- add w7,w7,w12
- ldr w12,[sp,#4]
- str w15,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w14,w9,#7
- and w17,w21,w20
- ror w13,w6,#17
- bic w19,w22,w20
- ror w15,w24,#2
- add w23,w23,w7 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w14,w14,w9,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w15,w15,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w13,w13,w6,ror#19
- eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w15,w24,ror#22 // Sigma0(a)
- eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
- add w8,w8,w1
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w8,w8,w14
- add w23,w23,w17 // h+=Sigma0(a)
- add w8,w8,w13
- ldr w13,[sp,#8]
- str w0,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w15,w10,#7
- and w17,w20,w27
- ror w14,w7,#17
- bic w28,w21,w27
- ror w0,w23,#2
- add w22,w22,w8 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w15,w15,w10,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w0,w0,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w14,w14,w7,ror#19
- eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w0,w23,ror#22 // Sigma0(a)
- eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
- add w9,w9,w2
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w9,w9,w15
- add w22,w22,w17 // h+=Sigma0(a)
- add w9,w9,w14
- ldr w14,[sp,#12]
- str w1,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w0,w11,#7
- and w17,w27,w26
- ror w15,w8,#17
- bic w19,w20,w26
- ror w1,w22,#2
- add w21,w21,w9 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w0,w0,w11,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w1,w1,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w15,w15,w8,ror#19
- eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w1,w22,ror#22 // Sigma0(a)
- eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
- add w10,w10,w3
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w10,w10,w0
- add w21,w21,w17 // h+=Sigma0(a)
- add w10,w10,w15
- ldr w15,[sp,#0]
- str w2,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w1,w12,#7
- and w17,w26,w25
- ror w0,w9,#17
- bic w28,w27,w25
- ror w2,w21,#2
- add w20,w20,w10 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w1,w1,w12,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w2,w2,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w0,w0,w9,ror#19
- eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w2,w21,ror#22 // Sigma0(a)
- eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
- add w11,w11,w4
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w11,w11,w1
- add w20,w20,w17 // h+=Sigma0(a)
- add w11,w11,w0
- ldr w0,[sp,#4]
- str w3,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w2,w13,#7
- and w17,w25,w24
- ror w1,w10,#17
- bic w19,w26,w24
- ror w3,w20,#2
- add w27,w27,w11 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w2,w2,w13,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w3,w3,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w1,w1,w10,ror#19
- eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w3,w20,ror#22 // Sigma0(a)
- eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
- add w12,w12,w5
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w12,w12,w2
- add w27,w27,w17 // h+=Sigma0(a)
- add w12,w12,w1
- ldr w1,[sp,#8]
- str w4,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w3,w14,#7
- and w17,w24,w23
- ror w2,w11,#17
- bic w28,w25,w23
- ror w4,w27,#2
- add w26,w26,w12 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w3,w3,w14,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w4,w4,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w2,w2,w11,ror#19
- eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w4,w27,ror#22 // Sigma0(a)
- eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
- add w13,w13,w6
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w13,w13,w3
- add w26,w26,w17 // h+=Sigma0(a)
- add w13,w13,w2
- ldr w2,[sp,#12]
- str w5,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w4,w15,#7
- and w17,w23,w22
- ror w3,w12,#17
- bic w19,w24,w22
- ror w5,w26,#2
- add w25,w25,w13 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w4,w4,w15,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w5,w5,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w3,w3,w12,ror#19
- eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w5,w26,ror#22 // Sigma0(a)
- eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
- add w14,w14,w7
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w14,w14,w4
- add w25,w25,w17 // h+=Sigma0(a)
- add w14,w14,w3
- ldr w3,[sp,#0]
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w5,w0,#7
- and w17,w22,w21
- ror w4,w13,#17
- bic w28,w23,w21
- ror w6,w25,#2
- add w24,w24,w14 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w5,w5,w0,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w6,w6,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w4,w4,w13,ror#19
- eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w25,ror#22 // Sigma0(a)
- eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
- add w15,w15,w8
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w15,w15,w5
- add w24,w24,w17 // h+=Sigma0(a)
- add w15,w15,w4
- ldr w4,[sp,#4]
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w6,w1,#7
- and w17,w21,w20
- ror w5,w14,#17
- bic w19,w22,w20
- ror w7,w24,#2
- add w23,w23,w15 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w6,w6,w1,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w7,w7,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w5,w5,w14,ror#19
- eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w24,ror#22 // Sigma0(a)
- eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
- add w0,w0,w9
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w0,w0,w6
- add w23,w23,w17 // h+=Sigma0(a)
- add w0,w0,w5
- ldr w5,[sp,#8]
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w7,w2,#7
- and w17,w20,w27
- ror w6,w15,#17
- bic w28,w21,w27
- ror w8,w23,#2
- add w22,w22,w0 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w7,w7,w2,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w8,w8,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w6,w6,w15,ror#19
- eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w23,ror#22 // Sigma0(a)
- eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
- add w1,w1,w10
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w1,w1,w7
- add w22,w22,w17 // h+=Sigma0(a)
- add w1,w1,w6
- ldr w6,[sp,#12]
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w8,w3,#7
- and w17,w27,w26
- ror w7,w0,#17
- bic w19,w20,w26
- ror w9,w22,#2
- add w21,w21,w1 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w8,w8,w3,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w9,w9,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w7,w7,w0,ror#19
- eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w22,ror#22 // Sigma0(a)
- eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
- add w2,w2,w11
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w2,w2,w8
- add w21,w21,w17 // h+=Sigma0(a)
- add w2,w2,w7
- ldr w7,[sp,#0]
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
- cbnz w19,Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#260 // rewind
-
- ldp w3,w4,[x0]
- ldp w5,w6,[x0,#2*4]
- add x1,x1,#14*4 // advance input pointer
- ldp w7,w8,[x0,#4*4]
- add w20,w20,w3
- ldp w9,w10,[x0,#6*4]
- add w21,w21,w4
- add w22,w22,w5
- add w23,w23,w6
- stp w20,w21,[x0]
- add w24,w24,w7
- add w25,w25,w8
- stp w22,w23,[x0,#2*4]
- add w26,w26,w9
- add w27,w27,w10
- cmp x1,x2
- stp w24,w25,[x0,#4*4]
- stp w26,w27,[x0,#6*4]
- b.ne Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*4
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
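For reference, the scalar routine above is a direct transcription of the FIPS 180-4 SHA-256 round function; the Ch, Maj, Sigma0 and Sigma1 terms named in the register comments correspond to the plain-C sketch below. This is illustrative only (not BoringSSL code, and the helper names are invented here); the rotation counts match the ror #6/#11/#25 and #2/#13/#22 operands used in the assembly.

#include <stdint.h>

/* Rotate right, as the assembly's ",ror#n" operands do. */
static uint32_t rotr32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round on the working state s = {a,b,c,d,e,f,g,h},
 * with round constant k (an LK256 entry) and schedule word w. */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w) {
  uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
  uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

  uint32_t Sigma1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
  uint32_t Ch     = (e & f) ^ (~e & g);
  uint32_t t1     = h + Sigma1 + Ch + k + w;   /* the "h+=K[i]", "h+=X[i]", "h+=Ch", "h+=Sigma1" steps */

  uint32_t Sigma0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
  uint32_t Maj    = (a & b) ^ (a & c) ^ (b & c);
  uint32_t t2     = Sigma0 + Maj;

  /* Rotate the working variables: the assembly's "d+=h" is d + t1 here,
   * and the finished h (the next round's a) is t1 + t2. */
  s[7] = g;  s[6] = f;  s[5] = e;  s[4] = d + t1;
  s[3] = c;  s[2] = b;  s[1] = a;  s[0] = t1 + t2;
}

The Loop_16_xx block expands the message schedule in the standard way, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], with sigma0(x) = rotr(x,7) ^ rotr(x,18) ^ (x >> 3) and sigma1(x) = rotr(x,17) ^ rotr(x,19) ^ (x >> 10), which is where the ror #7/#18 and #17/#19 operands come from. The SHA-512 file deleted further below has the identical structure with 64-bit words and rotation counts 14/18/41, 28/34/39, 1/8/7 and 19/61/6.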
-.section __TEXT,__const
-.align 6
-
-LK256:
-.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long 0 //terminator
-
-.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-.text
-#ifndef __KERNEL__
-
-.align 6
-sha256_block_armv8:
-Lv8_entry:
- // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v0.4s,v1.4s},[x0]
- adrp x3,LK256@PAGE
- add x3,x3,LK256@PAGEOFF
-
-Loop_hw:
- ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
- sub x2,x2,#1
- ld1 {v16.4s},[x3],#16
- rev32 v4.16b,v4.16b
- rev32 v5.16b,v5.16b
- rev32 v6.16b,v6.16b
- rev32 v7.16b,v7.16b
- orr v18.16b,v0.16b,v0.16b // offload
- orr v19.16b,v1.16b,v1.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
-.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
-.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
-.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
-.long 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
-.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
-.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
-.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
-.long 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
-.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
-.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
-.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
-.long 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- ld1 {v17.4s},[x3]
- add v16.4s,v16.4s,v6.4s
- sub x3,x3,#64*4-16 // rewind
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
-.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- add v17.4s,v17.4s,v7.4s
- orr v2.16b,v0.16b,v0.16b
-.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
-.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- add v0.4s,v0.4s,v18.4s
- add v1.4s,v1.4s,v19.4s
-
- cbnz x2,Loop_hw
-
- st1 {v0.4s,v1.4s},[x0]
-
- ldr x29,[sp],#16
- ret
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
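The file deleted above also carries a hardware path (Lv8_entry / sha256_block_armv8) that drives the ARMv8 SHA-256 extension; the .long words are pre-encoded sha256h, sha256h2, sha256su0 and sha256su1 instructions, spelled out in their trailing comments, so the source assembles even when the assembler does not know the crypto extension. As a rough sketch, assuming an ACLE toolchain with <arm_neon.h> and the SHA-2 intrinsics enabled (typically -march=armv8-a+crypto), four rounds of the Loop_hw pattern look like this (helper names invented for the example):

#include <arm_neon.h>

/* Four SHA-256 rounds in the style of the Loop_hw body: wk = W + K,
 * save ABCD, then sha256h/sha256h2 update the two state halves. */
static inline void sha256_hw_4rounds(uint32x4_t *abcd, uint32x4_t *efgh,
                                     uint32x4_t w, uint32x4_t k) {
  uint32x4_t wk = vaddq_u32(w, k);                /* add v16.4s,v16.4s,v4.4s  */
  uint32x4_t abcd_saved = *abcd;                  /* orr v2.16b,v0.16b,v0.16b */
  *abcd = vsha256hq_u32(*abcd, *efgh, wk);        /* sha256h  v0,v1,v16       */
  *efgh = vsha256h2q_u32(*efgh, abcd_saved, wk);  /* sha256h2 v1,v2,v16       */
}

/* Matching schedule update for a later group of rounds, as in the
 * interleaved sha256su0/sha256su1 pairs. */
static inline uint32x4_t sha256_hw_schedule(uint32x4_t w0, uint32x4_t w1,
                                            uint32x4_t w2, uint32x4_t w3) {
  return vsha256su1q_u32(vsha256su0q_u32(w0, w1), w2, w3);
}

The scalar entry point only branches to this path when OPENSSL_armcap_P reports ARMV8_SHA256 (the tst/b.ne Lv8_entry sequence near the top of the file), so a build on hardware without the extension still goes through the scalar rounds sketched earlier.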
diff --git a/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S
deleted file mode 100644
index 10e8aaf..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S
+++ /dev/null
@@ -1,1606 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-// SHA256-hw SHA256(*) SHA512
-// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-// Denver 2.01 10.5 (+26%) 6.70 (+8%)
-// X-Gene 20.0 (+100%) 12.8 (+300%(***))
-// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-// (***) Super-impressive coefficients over gcc-generated code are
-// indication of some compiler "pathology", most notably code
-// generated with -mgeneral-regs-only is significantly faster
-// and the gap is only 40-90%.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
-.text
-
-
-.private_extern _OPENSSL_armcap_P
-.globl _sha512_block_data_order
-.private_extern _sha512_block_data_order
-
-.align 6
-_sha512_block_data_order:
- AARCH64_VALID_CALL_TARGET
-#ifndef __KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
- adrp x16,_OPENSSL_armcap_P@PAGE
-#endif
- ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
- tst w16,#ARMV8_SHA512
- b.ne Lv8_entry
-#endif
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*8
-
- ldp x20,x21,[x0] // load context
- ldp x22,x23,[x0,#2*8]
- ldp x24,x25,[x0,#4*8]
- add x2,x1,x2,lsl#7 // end of input
- ldp x26,x27,[x0,#6*8]
- adrp x30,LK512@PAGE
- add x30,x30,LK512@PAGEOFF
- stp x0,x2,[x29,#96]
-
-Loop:
- ldp x3,x4,[x1],#2*8
- ldr x19,[x30],#8 // *K++
- eor x28,x21,x22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev x3,x3 // 0
-#endif
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x6,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x3 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x4,x4 // 1
-#endif
- ldp x5,x6,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x7,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x4 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x5,x5 // 2
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x8,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x5 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x6,x6 // 3
-#endif
- ldp x7,x8,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x9,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x6 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x7,x7 // 4
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x10,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x7 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x10,ror#18 // Sigma1(e)
- ror x10,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x10,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x8,x8 // 5
-#endif
- ldp x9,x10,[x1],#2*8
- add x23,x23,x17 // h+=Sigma0(a)
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x11,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x8 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x11,ror#18 // Sigma1(e)
- ror x11,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x11,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x9,x9 // 6
-#endif
- add x22,x22,x17 // h+=Sigma0(a)
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x12,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x9 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x12,ror#18 // Sigma1(e)
- ror x12,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x12,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x10,x10 // 7
-#endif
- ldp x11,x12,[x1],#2*8
- add x21,x21,x17 // h+=Sigma0(a)
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- eor x13,x25,x25,ror#23
- and x17,x26,x25
- bic x28,x27,x25
- add x20,x20,x10 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x13,ror#18 // Sigma1(e)
- ror x13,x21,#28
- add x20,x20,x17 // h+=Ch(e,f,g)
- eor x17,x21,x21,ror#5
- add x20,x20,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x24,x24,x20 // d+=h
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x13,x17,ror#34 // Sigma0(a)
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x20,x20,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x11,x11 // 8
-#endif
- add x20,x20,x17 // h+=Sigma0(a)
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x14,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x11 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x14,ror#18 // Sigma1(e)
- ror x14,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x14,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x12,x12 // 9
-#endif
- ldp x13,x14,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x15,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x12 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x15,ror#18 // Sigma1(e)
- ror x15,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x15,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x13,x13 // 10
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x0,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x13 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x0,ror#18 // Sigma1(e)
- ror x0,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x0,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x14,x14 // 11
-#endif
- ldp x15,x0,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x6,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x14 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x15,x15 // 12
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x7,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x15 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x0,x0 // 13
-#endif
- ldp x1,x2,[x1]
- add x23,x23,x17 // h+=Sigma0(a)
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x8,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x0 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x1,x1 // 14
-#endif
- ldr x6,[sp,#24]
- add x22,x22,x17 // h+=Sigma0(a)
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x9,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x1 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x2,x2 // 15
-#endif
- ldr x7,[sp,#0]
- add x21,x21,x17 // h+=Sigma0(a)
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
-Loop_16_xx:
- ldr x8,[sp,#8]
- str x11,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x10,x5,#1
- and x17,x25,x24
- ror x9,x2,#19
- bic x19,x26,x24
- ror x11,x20,#28
- add x27,x27,x3 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x10,x10,x5,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x11,x11,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x9,x9,x2,ror#61
- eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x11,x20,ror#39 // Sigma0(a)
- eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
- add x4,x4,x13
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x4,x4,x10
- add x27,x27,x17 // h+=Sigma0(a)
- add x4,x4,x9
- ldr x9,[sp,#16]
- str x12,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x11,x6,#1
- and x17,x24,x23
- ror x10,x3,#19
- bic x28,x25,x23
- ror x12,x27,#28
- add x26,x26,x4 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x11,x11,x6,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x12,x12,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x10,x10,x3,ror#61
- eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x12,x27,ror#39 // Sigma0(a)
- eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
- add x5,x5,x14
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x5,x5,x11
- add x26,x26,x17 // h+=Sigma0(a)
- add x5,x5,x10
- ldr x10,[sp,#24]
- str x13,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x12,x7,#1
- and x17,x23,x22
- ror x11,x4,#19
- bic x19,x24,x22
- ror x13,x26,#28
- add x25,x25,x5 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x12,x12,x7,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x13,x13,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x11,x11,x4,ror#61
- eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x13,x26,ror#39 // Sigma0(a)
- eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
- add x6,x6,x15
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x6,x6,x12
- add x25,x25,x17 // h+=Sigma0(a)
- add x6,x6,x11
- ldr x11,[sp,#0]
- str x14,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x13,x8,#1
- and x17,x22,x21
- ror x12,x5,#19
- bic x28,x23,x21
- ror x14,x25,#28
- add x24,x24,x6 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x13,x13,x8,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x14,x14,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x12,x12,x5,ror#61
- eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x14,x25,ror#39 // Sigma0(a)
- eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
- add x7,x7,x0
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x7,x7,x13
- add x24,x24,x17 // h+=Sigma0(a)
- add x7,x7,x12
- ldr x12,[sp,#8]
- str x15,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x14,x9,#1
- and x17,x21,x20
- ror x13,x6,#19
- bic x19,x22,x20
- ror x15,x24,#28
- add x23,x23,x7 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x14,x14,x9,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x15,x15,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x13,x13,x6,ror#61
- eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x15,x24,ror#39 // Sigma0(a)
- eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
- add x8,x8,x1
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x8,x8,x14
- add x23,x23,x17 // h+=Sigma0(a)
- add x8,x8,x13
- ldr x13,[sp,#16]
- str x0,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x15,x10,#1
- and x17,x20,x27
- ror x14,x7,#19
- bic x28,x21,x27
- ror x0,x23,#28
- add x22,x22,x8 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x15,x15,x10,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x0,x0,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x14,x14,x7,ror#61
- eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x0,x23,ror#39 // Sigma0(a)
- eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
- add x9,x9,x2
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x9,x9,x15
- add x22,x22,x17 // h+=Sigma0(a)
- add x9,x9,x14
- ldr x14,[sp,#24]
- str x1,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x0,x11,#1
- and x17,x27,x26
- ror x15,x8,#19
- bic x19,x20,x26
- ror x1,x22,#28
- add x21,x21,x9 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x0,x0,x11,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x1,x1,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x15,x15,x8,ror#61
- eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x1,x22,ror#39 // Sigma0(a)
- eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
- add x10,x10,x3
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x10,x10,x0
- add x21,x21,x17 // h+=Sigma0(a)
- add x10,x10,x15
- ldr x15,[sp,#0]
- str x2,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x1,x12,#1
- and x17,x26,x25
- ror x0,x9,#19
- bic x28,x27,x25
- ror x2,x21,#28
- add x20,x20,x10 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x1,x1,x12,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x2,x2,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x0,x0,x9,ror#61
- eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x2,x21,ror#39 // Sigma0(a)
- eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
- add x11,x11,x4
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x11,x11,x1
- add x20,x20,x17 // h+=Sigma0(a)
- add x11,x11,x0
- ldr x0,[sp,#8]
- str x3,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x2,x13,#1
- and x17,x25,x24
- ror x1,x10,#19
- bic x19,x26,x24
- ror x3,x20,#28
- add x27,x27,x11 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x2,x2,x13,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x3,x3,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x1,x1,x10,ror#61
- eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x3,x20,ror#39 // Sigma0(a)
- eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
- add x12,x12,x5
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x12,x12,x2
- add x27,x27,x17 // h+=Sigma0(a)
- add x12,x12,x1
- ldr x1,[sp,#16]
- str x4,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x3,x14,#1
- and x17,x24,x23
- ror x2,x11,#19
- bic x28,x25,x23
- ror x4,x27,#28
- add x26,x26,x12 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x3,x3,x14,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x4,x4,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x2,x2,x11,ror#61
- eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x4,x27,ror#39 // Sigma0(a)
- eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
- add x13,x13,x6
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x13,x13,x3
- add x26,x26,x17 // h+=Sigma0(a)
- add x13,x13,x2
- ldr x2,[sp,#24]
- str x5,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x4,x15,#1
- and x17,x23,x22
- ror x3,x12,#19
- bic x19,x24,x22
- ror x5,x26,#28
- add x25,x25,x13 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x4,x4,x15,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x5,x5,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x3,x3,x12,ror#61
- eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x5,x26,ror#39 // Sigma0(a)
- eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
- add x14,x14,x7
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x14,x14,x4
- add x25,x25,x17 // h+=Sigma0(a)
- add x14,x14,x3
- ldr x3,[sp,#0]
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x5,x0,#1
- and x17,x22,x21
- ror x4,x13,#19
- bic x28,x23,x21
- ror x6,x25,#28
- add x24,x24,x14 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x5,x5,x0,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x6,x6,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x4,x4,x13,ror#61
- eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x25,ror#39 // Sigma0(a)
- eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
- add x15,x15,x8
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x15,x15,x5
- add x24,x24,x17 // h+=Sigma0(a)
- add x15,x15,x4
- ldr x4,[sp,#8]
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x6,x1,#1
- and x17,x21,x20
- ror x5,x14,#19
- bic x19,x22,x20
- ror x7,x24,#28
- add x23,x23,x15 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x6,x6,x1,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x7,x7,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x5,x5,x14,ror#61
- eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x24,ror#39 // Sigma0(a)
- eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
- add x0,x0,x9
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x0,x0,x6
- add x23,x23,x17 // h+=Sigma0(a)
- add x0,x0,x5
- ldr x5,[sp,#16]
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x7,x2,#1
- and x17,x20,x27
- ror x6,x15,#19
- bic x28,x21,x27
- ror x8,x23,#28
- add x22,x22,x0 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x7,x7,x2,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x8,x8,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x6,x6,x15,ror#61
- eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x23,ror#39 // Sigma0(a)
- eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
- add x1,x1,x10
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x1,x1,x7
- add x22,x22,x17 // h+=Sigma0(a)
- add x1,x1,x6
- ldr x6,[sp,#24]
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x8,x3,#1
- and x17,x27,x26
- ror x7,x0,#19
- bic x19,x20,x26
- ror x9,x22,#28
- add x21,x21,x1 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x8,x8,x3,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x9,x9,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x7,x7,x0,ror#61
- eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x22,ror#39 // Sigma0(a)
- eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
- add x2,x2,x11
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x2,x2,x8
- add x21,x21,x17 // h+=Sigma0(a)
- add x2,x2,x7
- ldr x7,[sp,#0]
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
- cbnz x19,Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#648 // rewind
-
- ldp x3,x4,[x0]
- ldp x5,x6,[x0,#2*8]
- add x1,x1,#14*8 // advance input pointer
- ldp x7,x8,[x0,#4*8]
- add x20,x20,x3
- ldp x9,x10,[x0,#6*8]
- add x21,x21,x4
- add x22,x22,x5
- add x23,x23,x6
- stp x20,x21,[x0]
- add x24,x24,x7
- add x25,x25,x8
- stp x22,x23,[x0,#2*8]
- add x26,x26,x9
- add x27,x27,x10
- cmp x1,x2
- stp x24,x25,[x0,#4*8]
- stp x26,x27,[x0,#6*8]
- b.ne Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*8
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.section __TEXT,__const
-.align 6
-
-LK512:
-.quad 0x428a2f98d728ae22,0x7137449123ef65cd
-.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad 0x3956c25bf348b538,0x59f111f1b605d019
-.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad 0xd807aa98a3030242,0x12835b0145706fbe
-.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad 0x9bdc06a725c71235,0xc19bf174cf692694
-.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad 0x983e5152ee66dfab,0xa831c66d2db43210
-.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad 0x06ca6351e003826f,0x142929670a0e6e70
-.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad 0x81c2c92e47edaee6,0x92722c851482353b
-.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad 0xd192e819d6ef5218,0xd69906245565a910
-.quad 0xf40e35855771202a,0x106aa07032bbd1b8
-.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad 0x90befffa23631e28,0xa4506cebde82bde9
-.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad 0xca273eceea26619c,0xd186b8c721c0c207
-.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad 0x113f9804bef90dae,0x1b710b35131c471b
-.quad 0x28db77f523047d84,0x32caab7b40c72493
-.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.quad 0 // terminator
-
-.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-.text
-#ifndef __KERNEL__
-
-.align 6
-sha512_block_armv8:
-Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
- ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
- ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
- adrp x3,LK512@PAGE
- add x3,x3,LK512@PAGEOFF
-
- rev64 v16.16b,v16.16b
- rev64 v17.16b,v17.16b
- rev64 v18.16b,v18.16b
- rev64 v19.16b,v19.16b
- rev64 v20.16b,v20.16b
- rev64 v21.16b,v21.16b
- rev64 v22.16b,v22.16b
- rev64 v23.16b,v23.16b
- b Loop_hw
-
-.align 4
-Loop_hw:
- ld1 {v24.2d},[x3],#16
- subs x2,x2,#1
- sub x4,x1,#128
- orr v26.16b,v0.16b,v0.16b // offload
- orr v27.16b,v1.16b,v1.16b
- orr v28.16b,v2.16b,v2.16b
- orr v29.16b,v3.16b,v3.16b
- csel x1,x1,x4,ne // conditional rewind
- add v24.2d,v24.2d,v16.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08230 //sha512su0 v16.16b,v17.16b
- ext v7.16b,v20.16b,v21.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v25.2d,v25.2d,v17.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08251 //sha512su0 v17.16b,v18.16b
- ext v7.16b,v21.16b,v22.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v24.2d,v24.2d,v18.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08272 //sha512su0 v18.16b,v19.16b
- ext v7.16b,v22.16b,v23.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v25.2d,v25.2d,v19.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08293 //sha512su0 v19.16b,v20.16b
- ext v7.16b,v23.16b,v16.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v24.2d,v24.2d,v20.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
- ext v7.16b,v16.16b,v17.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v25.2d,v25.2d,v21.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
- ext v7.16b,v17.16b,v18.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v24.2d,v24.2d,v22.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
- ext v7.16b,v18.16b,v19.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v25.2d,v25.2d,v23.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08217 //sha512su0 v23.16b,v16.16b
- ext v7.16b,v19.16b,v20.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v24.2d,v24.2d,v16.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08230 //sha512su0 v16.16b,v17.16b
- ext v7.16b,v20.16b,v21.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v25.2d,v25.2d,v17.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08251 //sha512su0 v17.16b,v18.16b
- ext v7.16b,v21.16b,v22.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v24.2d,v24.2d,v18.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08272 //sha512su0 v18.16b,v19.16b
- ext v7.16b,v22.16b,v23.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v25.2d,v25.2d,v19.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08293 //sha512su0 v19.16b,v20.16b
- ext v7.16b,v23.16b,v16.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v24.2d,v24.2d,v20.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
- ext v7.16b,v16.16b,v17.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v25.2d,v25.2d,v21.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
- ext v7.16b,v17.16b,v18.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v24.2d,v24.2d,v22.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
- ext v7.16b,v18.16b,v19.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v25.2d,v25.2d,v23.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08217 //sha512su0 v23.16b,v16.16b
- ext v7.16b,v19.16b,v20.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v24.2d,v24.2d,v16.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08230 //sha512su0 v16.16b,v17.16b
- ext v7.16b,v20.16b,v21.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v25.2d,v25.2d,v17.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08251 //sha512su0 v17.16b,v18.16b
- ext v7.16b,v21.16b,v22.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v24.2d,v24.2d,v18.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08272 //sha512su0 v18.16b,v19.16b
- ext v7.16b,v22.16b,v23.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v25.2d,v25.2d,v19.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08293 //sha512su0 v19.16b,v20.16b
- ext v7.16b,v23.16b,v16.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v24.2d,v24.2d,v20.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
- ext v7.16b,v16.16b,v17.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v25.2d,v25.2d,v21.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
- ext v7.16b,v17.16b,v18.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v24.2d,v24.2d,v22.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
- ext v7.16b,v18.16b,v19.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v25.2d,v25.2d,v23.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08217 //sha512su0 v23.16b,v16.16b
- ext v7.16b,v19.16b,v20.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v24.2d,v24.2d,v16.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08230 //sha512su0 v16.16b,v17.16b
- ext v7.16b,v20.16b,v21.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v25.2d,v25.2d,v17.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08251 //sha512su0 v17.16b,v18.16b
- ext v7.16b,v21.16b,v22.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v24.2d,v24.2d,v18.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec08272 //sha512su0 v18.16b,v19.16b
- ext v7.16b,v22.16b,v23.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- add v25.2d,v25.2d,v19.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08293 //sha512su0 v19.16b,v20.16b
- ext v7.16b,v23.16b,v16.16b,#8
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
-.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- add v24.2d,v24.2d,v20.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
- ext v7.16b,v16.16b,v17.16b,#8
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
-.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- add v25.2d,v25.2d,v21.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
- ext v7.16b,v17.16b,v18.16b,#8
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
-.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v24.2d,v24.2d,v22.2d
- ld1 {v25.2d},[x3],#16
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
- ext v7.16b,v18.16b,v19.16b,#8
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
-.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- add v25.2d,v25.2d,v23.2d
- ld1 {v24.2d},[x3],#16
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xcec08217 //sha512su0 v23.16b,v16.16b
- ext v7.16b,v19.16b,v20.16b,#8
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
-.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- ld1 {v25.2d},[x3],#16
- add v24.2d,v24.2d,v16.2d
- ld1 {v16.16b},[x1],#16 // load next input
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
- rev64 v16.16b,v16.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- ld1 {v24.2d},[x3],#16
- add v25.2d,v25.2d,v17.2d
- ld1 {v17.16b},[x1],#16 // load next input
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
- rev64 v17.16b,v17.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- ld1 {v25.2d},[x3],#16
- add v24.2d,v24.2d,v18.2d
- ld1 {v18.16b},[x1],#16 // load next input
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
- rev64 v18.16b,v18.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- ld1 {v24.2d},[x3],#16
- add v25.2d,v25.2d,v19.2d
- ld1 {v19.16b},[x1],#16 // load next input
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v2.16b,v3.16b,#8
- ext v6.16b,v1.16b,v2.16b,#8
- add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
- rev64 v19.16b,v19.16b
- add v4.2d,v1.2d,v3.2d // "D + T1"
-.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
- ld1 {v25.2d},[x3],#16
- add v24.2d,v24.2d,v20.2d
- ld1 {v20.16b},[x1],#16 // load next input
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v4.16b,v2.16b,#8
- ext v6.16b,v0.16b,v4.16b,#8
- add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
- rev64 v20.16b,v20.16b
- add v1.2d,v0.2d,v2.2d // "D + T1"
-.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
- ld1 {v24.2d},[x3],#16
- add v25.2d,v25.2d,v21.2d
- ld1 {v21.16b},[x1],#16 // load next input
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v1.16b,v4.16b,#8
- ext v6.16b,v3.16b,v1.16b,#8
- add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
- rev64 v21.16b,v21.16b
- add v0.2d,v3.2d,v4.2d // "D + T1"
-.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
- ld1 {v25.2d},[x3],#16
- add v24.2d,v24.2d,v22.2d
- ld1 {v22.16b},[x1],#16 // load next input
- ext v24.16b,v24.16b,v24.16b,#8
- ext v5.16b,v0.16b,v1.16b,#8
- ext v6.16b,v2.16b,v0.16b,#8
- add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
-.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
- rev64 v22.16b,v22.16b
- add v3.2d,v2.2d,v1.2d // "D + T1"
-.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
- sub x3,x3,#80*8 // rewind
- add v25.2d,v25.2d,v23.2d
- ld1 {v23.16b},[x1],#16 // load next input
- ext v25.16b,v25.16b,v25.16b,#8
- ext v5.16b,v3.16b,v0.16b,#8
- ext v6.16b,v4.16b,v3.16b,#8
- add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
-.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
- rev64 v23.16b,v23.16b
- add v2.2d,v4.2d,v0.2d // "D + T1"
-.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
- add v0.2d,v0.2d,v26.2d // accumulate
- add v1.2d,v1.2d,v27.2d
- add v2.2d,v2.2d,v28.2d
- add v3.2d,v3.2d,v29.2d
-
- cbnz x2,Loop_hw
-
- st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
-
- ldr x29,[sp],#16
- ret
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S
deleted file mode 100644
index a108a96..0000000
--- a/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S
+++ /dev/null
@@ -1,1224 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.section __TEXT,__const
-
-
-.align 7 // totally strategic alignment
-_vpaes_consts:
-Lk_mc_forward: // mc_forward
-.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad 0x080B0A0904070605, 0x000302010C0F0E0D
-.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad 0x000302010C0F0E0D, 0x080B0A0904070605
-Lk_mc_backward: // mc_backward
-.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad 0x020100030E0D0C0F, 0x0A09080B06050407
-.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad 0x0A09080B06050407, 0x020100030E0D0C0F
-Lk_sr: // sr
-.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad 0x030E09040F0A0500, 0x0B06010C07020D08
-.quad 0x0F060D040B020900, 0x070E050C030A0108
-.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
-
-//
-// "Hot" constants
-//
-Lk_inv: // inv, inva
-.quad 0x0E05060F0D080180, 0x040703090A0B0C02
-.quad 0x01040A060F0B0780, 0x030D0E0C02050809
-Lk_ipt: // input transform (lo, hi)
-.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-Lk_sbo: // sbou, sbot
-.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-Lk_sb1: // sb1u, sb1t
-.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-Lk_sb2: // sb2u, sb2t
-.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
-.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
-
-//
-// Decryption stuff
-//
-Lk_dipt: // decryption input transform
-.quad 0x0F505B040B545F00, 0x154A411E114E451A
-.quad 0x86E383E660056500, 0x12771772F491F194
-Lk_dsbo: // decryption sbox final output
-.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-Lk_dsb9: // decryption sbox output *9*u, *9*t
-.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-Lk_dsbd: // decryption sbox output *D*u, *D*t
-.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-Lk_dsbb: // decryption sbox output *B*u, *B*t
-.quad 0xD022649296B44200, 0x602646F6B0F2D404
-.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-Lk_dsbe: // decryption sbox output *E*u, *E*t
-.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-//
-// Key schedule constants
-//
-Lk_dksd: // decryption key schedule: invskew x*D
-.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-Lk_dksb: // decryption key schedule: invskew x*B
-.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-Lk_dkse: // decryption key schedule: invskew x*E + 0x63
-.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-Lk_dks9: // decryption key schedule: invskew x*9
-.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-Lk_rcon: // rcon
-.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-Lk_opt: // output transform
-.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-Lk_deskew: // deskew tables: inverts the sbox's "skew"
-.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.align 2
-
-.align 6
-
-.text
-##
-## _aes_preheat
-##
-## Fills register %r10 -> .aes_consts (so you can -fPIC)
-## and %xmm9-%xmm15 as specified below.
-##
-
-.align 4
-_vpaes_encrypt_preheat:
- adrp x10, Lk_inv@PAGE
- add x10, x10, Lk_inv@PAGEOFF
- movi v17.16b, #0x0f
- ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
- ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
- ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
- ret
-
-
-##
-## _aes_encrypt_core
-##
-## AES-encrypt %xmm0.
-##
-## Inputs:
-## %xmm0 = input
-## %xmm9-%xmm15 as in _vpaes_preheat
-## (%rdx) = scheduled keys
-##
-## Output in %xmm0
-## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
-## Preserves %xmm6 - %xmm8 so you get some local vectors
-##
-##
-
-.align 4
-_vpaes_encrypt_core:
- mov x9, x2
- ldr w8, [x2,#240] // pull rounds
- adrp x11, Lk_mc_forward@PAGE+16
- add x11, x11, Lk_mc_forward@PAGEOFF+16
- // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
- ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
- and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
- ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
- tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
- // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
- tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
- eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
- eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
- b Lenc_entry
-
-.align 4
-Lenc_loop:
- // middle of middle round
- add x10, x11, #0x40
- tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
- ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
- tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
- ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
- tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
- eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
- tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
- tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
- eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
- and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
- sub w8, w8, #1 // nr--
-
-Lenc_entry:
- // top of round
- and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
- tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
- eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
- cbnz w8, Lenc_loop
-
- // middle of last round
- add x10, x11, #0x80
- // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
- // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
- tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
- tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
- ret
-
-
-.globl _vpaes_encrypt
-.private_extern _vpaes_encrypt
-
-.align 4
-_vpaes_encrypt:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v7.16b}, [x0]
- bl _vpaes_encrypt_preheat
- bl _vpaes_encrypt_core
- st1 {v0.16b}, [x1]
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-
-.align 4
-_vpaes_encrypt_2x:
- mov x9, x2
- ldr w8, [x2,#240] // pull rounds
- adrp x11, Lk_mc_forward@PAGE+16
- add x11, x11, Lk_mc_forward@PAGEOFF+16
- // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
- ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
- and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
- ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
- and v9.16b, v15.16b, v17.16b
- ushr v8.16b, v15.16b, #4
- tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
- tbl v9.16b, {v20.16b}, v9.16b
- // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
- tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
- tbl v10.16b, {v21.16b}, v8.16b
- eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
- eor v8.16b, v9.16b, v16.16b
- eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
- eor v8.16b, v8.16b, v10.16b
- b Lenc_2x_entry
-
-.align 4
-Lenc_2x_loop:
- // middle of middle round
- add x10, x11, #0x40
- tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
- tbl v12.16b, {v25.16b}, v10.16b
- ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
- tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
- tbl v8.16b, {v24.16b}, v11.16b
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- eor v12.16b, v12.16b, v16.16b
- tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
- tbl v13.16b, {v27.16b}, v10.16b
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- eor v8.16b, v8.16b, v12.16b
- tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
- tbl v10.16b, {v26.16b}, v11.16b
- ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
- tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
- tbl v11.16b, {v8.16b}, v1.16b
- eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
- eor v10.16b, v10.16b, v13.16b
- tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
- tbl v8.16b, {v8.16b}, v4.16b
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
- eor v11.16b, v11.16b, v10.16b
- tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
- tbl v12.16b, {v11.16b},v1.16b
- eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
- eor v8.16b, v8.16b, v11.16b
- and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
- eor v8.16b, v8.16b, v12.16b
- sub w8, w8, #1 // nr--
-
-Lenc_2x_entry:
- // top of round
- and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
- and v9.16b, v8.16b, v17.16b
- ushr v8.16b, v8.16b, #4
- tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
- tbl v13.16b, {v19.16b},v9.16b
- eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- eor v9.16b, v9.16b, v8.16b
- tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- tbl v11.16b, {v18.16b},v8.16b
- tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- tbl v12.16b, {v18.16b},v9.16b
- eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- eor v11.16b, v11.16b, v13.16b
- eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- eor v12.16b, v12.16b, v13.16b
- tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- tbl v10.16b, {v18.16b},v11.16b
- tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- tbl v11.16b, {v18.16b},v12.16b
- eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- eor v10.16b, v10.16b, v9.16b
- eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- eor v11.16b, v11.16b, v8.16b
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
- cbnz w8, Lenc_2x_loop
-
- // middle of last round
- add x10, x11, #0x80
- // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
- // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
- tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- tbl v12.16b, {v22.16b}, v10.16b
- ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
- tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
- tbl v8.16b, {v23.16b}, v11.16b
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- eor v12.16b, v12.16b, v16.16b
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- eor v8.16b, v8.16b, v12.16b
- tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
- tbl v1.16b, {v8.16b},v1.16b
- ret
-
-
-
-.align 4
-_vpaes_decrypt_preheat:
- adrp x10, Lk_inv@PAGE
- add x10, x10, Lk_inv@PAGEOFF
- movi v17.16b, #0x0f
- adrp x11, Lk_dipt@PAGE
- add x11, x11, Lk_dipt@PAGEOFF
- ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
- ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
- ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
- ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
- ret
-
-
-##
-## Decryption core
-##
-## Same API as encryption core.
-##
-
-.align 4
-_vpaes_decrypt_core:
- mov x9, x2
- ldr w8, [x2,#240] // pull rounds
-
- // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
- lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
- eor x11, x11, #0x30 // xor $0x30, %r11
- adrp x10, Lk_sr@PAGE
- add x10, x10, Lk_sr@PAGEOFF
- and x11, x11, #0x30 // and $0x30, %r11
- add x11, x11, x10
- adrp x10, Lk_mc_forward@PAGE+48
- add x10, x10, Lk_mc_forward@PAGEOFF+48
-
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
- and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
- ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
- tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
- ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
- // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
- tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
- eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
- eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
- b Ldec_entry
-
-.align 4
-Ldec_loop:
-//
-// Inverse mix columns
-//
- // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
- // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
- tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
- tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
- eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
- // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
-
- tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
- tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
-
- tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
- tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
-
- tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
- tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- sub w8, w8, #1 // sub $1,%rax # nr--
-
-Ldec_entry:
- // top of round
- and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
- tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
- eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
- cbnz w8, Ldec_loop
-
- // middle of last round
- // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
- tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
- ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
- tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
- eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
- tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
- ret
-
-
-.globl _vpaes_decrypt
-.private_extern _vpaes_decrypt
-
-.align 4
-_vpaes_decrypt:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v7.16b}, [x0]
- bl _vpaes_decrypt_preheat
- bl _vpaes_decrypt_core
- st1 {v0.16b}, [x1]
-
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-// v14-v15 input, v0-v1 output
-
-.align 4
-_vpaes_decrypt_2x:
- mov x9, x2
- ldr w8, [x2,#240] // pull rounds
-
- // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
- lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
- eor x11, x11, #0x30 // xor $0x30, %r11
- adrp x10, Lk_sr@PAGE
- add x10, x10, Lk_sr@PAGEOFF
- and x11, x11, #0x30 // and $0x30, %r11
- add x11, x11, x10
- adrp x10, Lk_mc_forward@PAGE+48
- add x10, x10, Lk_mc_forward@PAGEOFF+48
-
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
- and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
- ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
- and v9.16b, v15.16b, v17.16b
- ushr v8.16b, v15.16b, #4
- tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
- tbl v10.16b, {v20.16b},v9.16b
- ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
- // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
- tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
- tbl v8.16b, {v21.16b},v8.16b
- eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
- eor v10.16b, v10.16b, v16.16b
- eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
- eor v8.16b, v8.16b, v10.16b
- b Ldec_2x_entry
-
-.align 4
-Ldec_2x_loop:
-//
-// Inverse mix columns
-//
- // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
- // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
- tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
- tbl v12.16b, {v24.16b}, v10.16b
- tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
- tbl v9.16b, {v25.16b}, v11.16b
- eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
- eor v8.16b, v12.16b, v16.16b
- // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
-
- tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
- tbl v12.16b, {v26.16b}, v10.16b
- tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v8.16b, {v8.16b},v5.16b
- tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
- tbl v9.16b, {v27.16b}, v11.16b
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- eor v8.16b, v8.16b, v12.16b
- // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- eor v8.16b, v8.16b, v9.16b
- // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
-
- tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
- tbl v12.16b, {v28.16b}, v10.16b
- tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v8.16b, {v8.16b},v5.16b
- tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
- tbl v9.16b, {v29.16b}, v11.16b
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- eor v8.16b, v8.16b, v12.16b
- // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- eor v8.16b, v8.16b, v9.16b
- // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
-
- tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
- tbl v12.16b, {v30.16b}, v10.16b
- tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- tbl v8.16b, {v8.16b},v5.16b
- tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
- tbl v9.16b, {v31.16b}, v11.16b
- eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- eor v8.16b, v8.16b, v12.16b
- ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
- eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- eor v8.16b, v8.16b, v9.16b
- sub w8, w8, #1 // sub $1,%rax # nr--
-
-Ldec_2x_entry:
- // top of round
- and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
- and v9.16b, v8.16b, v17.16b
- ushr v8.16b, v8.16b, #4
- tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
- tbl v10.16b, {v19.16b},v9.16b
- eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- eor v9.16b, v9.16b, v8.16b
- tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- tbl v11.16b, {v18.16b},v8.16b
- tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- tbl v12.16b, {v18.16b},v9.16b
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- eor v11.16b, v11.16b, v10.16b
- eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- eor v12.16b, v12.16b, v10.16b
- tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- tbl v10.16b, {v18.16b},v11.16b
- tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- tbl v11.16b, {v18.16b},v12.16b
- eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- eor v10.16b, v10.16b, v9.16b
- eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- eor v11.16b, v11.16b, v8.16b
- ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
- cbnz w8, Ldec_2x_loop
-
- // middle of last round
- // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
- tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- tbl v12.16b, {v22.16b}, v10.16b
- // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
- tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
- tbl v9.16b, {v23.16b}, v11.16b
- ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
- eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
- eor v12.16b, v12.16b, v16.16b
- eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
- eor v8.16b, v9.16b, v12.16b
- tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
- tbl v1.16b, {v8.16b},v2.16b
- ret
-
-########################################################
-## ##
-## AES key schedule ##
-## ##
-########################################################
-
-.align 4
-_vpaes_key_preheat:
- adrp x10, Lk_inv@PAGE
- add x10, x10, Lk_inv@PAGEOFF
- movi v16.16b, #0x5b // Lk_s63
- adrp x11, Lk_sb1@PAGE
- add x11, x11, Lk_sb1@PAGEOFF
- movi v17.16b, #0x0f // Lk_s0F
- ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
- adrp x10, Lk_dksd@PAGE
- add x10, x10, Lk_dksd@PAGEOFF
- ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
- adrp x11, Lk_mc_forward@PAGE
- add x11, x11, Lk_mc_forward@PAGEOFF
- ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
- ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
- ld1 {v8.2d}, [x10] // Lk_rcon
- ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
- ret
-
-
-
-.align 4
-_vpaes_schedule_core:
- AARCH64_SIGN_LINK_REGISTER
- stp x29, x30, [sp,#-16]!
- add x29,sp,#0
-
- bl _vpaes_key_preheat // load the tables
-
- ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
-
- // input transform
- mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
- bl _vpaes_schedule_transform
- mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
-
- adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10
- add x10, x10, Lk_sr@PAGEOFF
-
- add x8, x8, x10
- cbnz w3, Lschedule_am_decrypting
-
- // encrypting, output zeroth round key after transform
- st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
- b Lschedule_go
-
-Lschedule_am_decrypting:
- // decrypting, output zeroth round key after shiftrows
- ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
- tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
- st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
- eor x8, x8, #0x30 // xor $0x30, %r8
-
-Lschedule_go:
- cmp w1, #192 // cmp $192, %esi
- b.hi Lschedule_256
- b.eq Lschedule_192
- // 128: fall though
-
-##
-## .schedule_128
-##
-## 128-bit specific part of key schedule.
-##
-## This schedule is really simple, because all its parts
-## are accomplished by the subroutines.
-##
-Lschedule_128:
- mov x0, #10 // mov $10, %esi
-
-Loop_schedule_128:
- sub x0, x0, #1 // dec %esi
- bl _vpaes_schedule_round
- cbz x0, Lschedule_mangle_last
- bl _vpaes_schedule_mangle // write output
- b Loop_schedule_128
-
-##
-## .aes_schedule_192
-##
-## 192-bit specific part of key schedule.
-##
-## The main body of this schedule is the same as the 128-bit
-## schedule, but with more smearing. The long, high side is
-## stored in %xmm7 as before, and the short, low side is in
-## the high bits of %xmm6.
-##
-## This schedule is somewhat nastier, however, because each
-## round produces 192 bits of key material, or 1.5 round keys.
-## Therefore, on each cycle we do 2 rounds and produce 3 round
-## keys.
-##
-.align 4
-Lschedule_192:
- sub x0, x0, #8
- ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
- bl _vpaes_schedule_transform // input transform
- mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
- eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
- ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
- mov x0, #4 // mov $4, %esi
-
-Loop_schedule_192:
- sub x0, x0, #1 // dec %esi
- bl _vpaes_schedule_round
- ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
- bl _vpaes_schedule_mangle // save key n
- bl _vpaes_schedule_192_smear
- bl _vpaes_schedule_mangle // save key n+1
- bl _vpaes_schedule_round
- cbz x0, Lschedule_mangle_last
- bl _vpaes_schedule_mangle // save key n+2
- bl _vpaes_schedule_192_smear
- b Loop_schedule_192
-
-##
-## .aes_schedule_256
-##
-## 256-bit specific part of key schedule.
-##
-## The structure here is very similar to the 128-bit
-## schedule, but with an additional "low side" in
-## %xmm6. The low side's rounds are the same as the
-## high side's, except no rcon and no rotation.
-##
-.align 4
-Lschedule_256:
- ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
- bl _vpaes_schedule_transform // input transform
- mov x0, #7 // mov $7, %esi
-
-Loop_schedule_256:
- sub x0, x0, #1 // dec %esi
- bl _vpaes_schedule_mangle // output low result
- mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
-
- // high round
- bl _vpaes_schedule_round
- cbz x0, Lschedule_mangle_last
- bl _vpaes_schedule_mangle
-
- // low round. swap xmm7 and xmm6
- dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
- movi v4.16b, #0
- mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
- mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
- bl _vpaes_schedule_low_round
- mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
-
- b Loop_schedule_256
-
-##
-## .aes_schedule_mangle_last
-##
-## Mangler for last round of key schedule
-## Mangles %xmm0
-## when encrypting, outputs out(%xmm0) ^ 63
-## when decrypting, outputs unskew(%xmm0)
-##
-## Always called right before return... jumps to cleanup and exits
-##
-.align 4
-Lschedule_mangle_last:
- // schedule last round key from xmm0
- adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew
- add x11, x11, Lk_deskew@PAGEOFF
-
- cbnz w3, Lschedule_mangle_last_dec
-
- // encrypting
- ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
- adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform
- add x11, x11, Lk_opt@PAGEOFF
- add x2, x2, #32 // add $32, %rdx
- tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
-
-Lschedule_mangle_last_dec:
- ld1 {v20.2d,v21.2d}, [x11] // reload constants
- sub x2, x2, #16 // add $-16, %rdx
- eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
- bl _vpaes_schedule_transform // output transform
- st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
-
- // cleanup
- eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
- eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
- eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
- eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
- eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
- eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
- eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
- eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
- ldp x29, x30, [sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-##
-## .aes_schedule_192_smear
-##
-## Smear the short, low side in the 192-bit key schedule.
-##
-## Inputs:
-## %xmm7: high side, b a x y
-## %xmm6: low side, d c 0 0
-## %xmm13: 0
-##
-## Outputs:
-## %xmm6: b+c+d b+c 0 0
-## %xmm0: b+c+d b+c b a
-##
-
-.align 4
-_vpaes_schedule_192_smear:
- movi v1.16b, #0
- dup v0.4s, v7.s[3]
- ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
- ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
- eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
- eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
- eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
- mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
- ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
- ret
-
-
-##
-## .aes_schedule_round
-##
-## Runs one main round of the key schedule on %xmm0, %xmm7
-##
-## Specifically, runs subbytes on the high dword of %xmm0
-## then rotates it by one byte and xors into the low dword of
-## %xmm7.
-##
-## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
-## next rcon.
-##
-## Smears the dwords of %xmm7 by xoring the low into the
-## second low, result into third, result into highest.
-##
-## Returns results in %xmm7 = %xmm0.
-## Clobbers %xmm1-%xmm4, %r11.
-##
-
-.align 4
-_vpaes_schedule_round:
- // extract rcon from xmm8
- movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
- ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
- ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
- eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
-
- // rotate
- dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
- ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
-
- // fall through...
-
- // low round: same as high round, but no rotation and no rcon.
-_vpaes_schedule_low_round:
- // smear xmm7
- ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
- eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
- ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
-
- // subbytes
- and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
- eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
- tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
- eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
- tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
- eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
- eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
- eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
- tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
- tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
- eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
-
- // add in smeared stuff
- eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
- eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
- ret
-
-
-##
-## .aes_schedule_transform
-##
-## Linear-transform %xmm0 according to tables at (%r11)
-##
-## Requires that %xmm9 = 0x0F0F... as in preheat
-## Output in %xmm0
-## Clobbers %xmm1, %xmm2
-##
-
-.align 4
-_vpaes_schedule_transform:
- and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
- ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
- // vmovdqa (%r11), %xmm2 # lo
- tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
- // vmovdqa 16(%r11), %xmm1 # hi
- tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
- eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
- ret
-
-
-##
-## .aes_schedule_mangle
-##
-## Mangle xmm0 from (basis-transformed) standard version
-## to our version.
-##
-## On encrypt,
-## xor with 0x63
-## multiply by circulant 0,1,1,1
-## apply shiftrows transform
-##
-## On decrypt,
-## xor with 0x63
-## multiply by "inverse mixcolumns" circulant E,B,D,9
-## deskew
-## apply shiftrows transform
-##
-##
-## Writes out to (%rdx), and increments or decrements it
-## Keeps track of round number mod 4 in %r8
-## Preserves xmm0
-## Clobbers xmm1-xmm5
-##
-
-.align 4
-_vpaes_schedule_mangle:
- mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
- // vmovdqa .Lk_mc_forward(%rip),%xmm5
- cbnz w3, Lschedule_mangle_dec
-
- // encrypting
- eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
- add x2, x2, #16 // add $16, %rdx
- tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
- tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
- tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
- eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
- ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
- eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
-
- b Lschedule_mangle_both
-.align 4
-Lschedule_mangle_dec:
- // inverse mix columns
- // lea .Lk_dksd(%rip),%r11
- ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
- and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
-
- // vmovdqa 0x00(%r11), %xmm2
- tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
- // vmovdqa 0x10(%r11), %xmm3
- tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
- tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
-
- // vmovdqa 0x20(%r11), %xmm2
- tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
- eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
- // vmovdqa 0x30(%r11), %xmm3
- tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
- tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
-
- // vmovdqa 0x40(%r11), %xmm2
- tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
- eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
- // vmovdqa 0x50(%r11), %xmm3
- tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
- eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
-
- // vmovdqa 0x60(%r11), %xmm2
- tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
- tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
- // vmovdqa 0x70(%r11), %xmm4
- tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
- ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
- eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
- eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
-
- sub x2, x2, #16 // add $-16, %rdx
-
-Lschedule_mangle_both:
- tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
- add x8, x8, #48 // add $-16, %r8
- and x8, x8, #~(1<<6) // and $0x30, %r8
- st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
- ret
-
-
-.globl _vpaes_set_encrypt_key
-.private_extern _vpaes_set_encrypt_key
-
-.align 4
-_vpaes_set_encrypt_key:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- stp d8,d9,[sp,#-16]! // ABI spec says so
-
- lsr w9, w1, #5 // shr $5,%eax
- add w9, w9, #5 // $5,%eax
- str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
-
- mov w3, #0 // mov $0,%ecx
- mov x8, #0x30 // mov $0x30,%r8d
- bl _vpaes_schedule_core
- eor x0, x0, x0
-
- ldp d8,d9,[sp],#16
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.globl _vpaes_set_decrypt_key
-.private_extern _vpaes_set_decrypt_key
-
-.align 4
-_vpaes_set_decrypt_key:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- stp d8,d9,[sp,#-16]! // ABI spec says so
-
- lsr w9, w1, #5 // shr $5,%eax
- add w9, w9, #5 // $5,%eax
- str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
- lsl w9, w9, #4 // shl $4,%eax
- add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
- add x2, x2, x9
-
- mov w3, #1 // mov $1,%ecx
- lsr w8, w1, #1 // shr $1,%r8d
- and x8, x8, #32 // and $32,%r8d
- eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
- bl _vpaes_schedule_core
-
- ldp d8,d9,[sp],#16
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _vpaes_cbc_encrypt
-.private_extern _vpaes_cbc_encrypt
-
-.align 4
-_vpaes_cbc_encrypt:
- AARCH64_SIGN_LINK_REGISTER
- cbz x2, Lcbc_abort
- cmp w5, #0 // check direction
- b.eq vpaes_cbc_decrypt
-
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- mov x17, x2 // reassign
- mov x2, x3 // reassign
-
- ld1 {v0.16b}, [x4] // load ivec
- bl _vpaes_encrypt_preheat
- b Lcbc_enc_loop
-
-.align 4
-Lcbc_enc_loop:
- ld1 {v7.16b}, [x0],#16 // load input
- eor v7.16b, v7.16b, v0.16b // xor with ivec
- bl _vpaes_encrypt_core
- st1 {v0.16b}, [x1],#16 // save output
- subs x17, x17, #16
- b.hi Lcbc_enc_loop
-
- st1 {v0.16b}, [x4] // write ivec
-
- ldp x29,x30,[sp],#16
-Lcbc_abort:
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-
-.align 4
-vpaes_cbc_decrypt:
- // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
- // only from vpaes_cbc_encrypt which has already signed the return address.
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- stp d8,d9,[sp,#-16]! // ABI spec says so
- stp d10,d11,[sp,#-16]!
- stp d12,d13,[sp,#-16]!
- stp d14,d15,[sp,#-16]!
-
- mov x17, x2 // reassign
- mov x2, x3 // reassign
- ld1 {v6.16b}, [x4] // load ivec
- bl _vpaes_decrypt_preheat
- tst x17, #16
- b.eq Lcbc_dec_loop2x
-
- ld1 {v7.16b}, [x0], #16 // load input
- bl _vpaes_decrypt_core
- eor v0.16b, v0.16b, v6.16b // xor with ivec
- orr v6.16b, v7.16b, v7.16b // next ivec value
- st1 {v0.16b}, [x1], #16
- subs x17, x17, #16
- b.ls Lcbc_dec_done
-
-.align 4
-Lcbc_dec_loop2x:
- ld1 {v14.16b,v15.16b}, [x0], #32
- bl _vpaes_decrypt_2x
- eor v0.16b, v0.16b, v6.16b // xor with ivec
- eor v1.16b, v1.16b, v14.16b
- orr v6.16b, v15.16b, v15.16b
- st1 {v0.16b,v1.16b}, [x1], #32
- subs x17, x17, #32
- b.hi Lcbc_dec_loop2x
-
-Lcbc_dec_done:
- st1 {v6.16b}, [x4]
-
- ldp d14,d15,[sp],#16
- ldp d12,d13,[sp],#16
- ldp d10,d11,[sp],#16
- ldp d8,d9,[sp],#16
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-.globl _vpaes_ctr32_encrypt_blocks
-.private_extern _vpaes_ctr32_encrypt_blocks
-
-.align 4
-_vpaes_ctr32_encrypt_blocks:
- AARCH64_SIGN_LINK_REGISTER
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- stp d8,d9,[sp,#-16]! // ABI spec says so
- stp d10,d11,[sp,#-16]!
- stp d12,d13,[sp,#-16]!
- stp d14,d15,[sp,#-16]!
-
- cbz x2, Lctr32_done
-
- // Note, unlike the other functions, x2 here is measured in blocks,
- // not bytes.
- mov x17, x2
- mov x2, x3
-
- // Load the IV and counter portion.
- ldr w6, [x4, #12]
- ld1 {v7.16b}, [x4]
-
- bl _vpaes_encrypt_preheat
- tst x17, #1
- rev w6, w6 // The counter is big-endian.
- b.eq Lctr32_prep_loop
-
- // Handle one block so the remaining block count is even for
- // _vpaes_encrypt_2x.
- ld1 {v6.16b}, [x0], #16 // Load input ahead of time
- bl _vpaes_encrypt_core
- eor v0.16b, v0.16b, v6.16b // XOR input and result
- st1 {v0.16b}, [x1], #16
- subs x17, x17, #1
- // Update the counter.
- add w6, w6, #1
- rev w7, w6
- mov v7.s[3], w7
- b.ls Lctr32_done
-
-Lctr32_prep_loop:
- // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
- // uses v14 and v15.
- mov v15.16b, v7.16b
- mov v14.16b, v7.16b
- add w6, w6, #1
- rev w7, w6
- mov v15.s[3], w7
-
-Lctr32_loop:
- ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
- bl _vpaes_encrypt_2x
- eor v0.16b, v0.16b, v6.16b // XOR input and result
- eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
- st1 {v0.16b,v1.16b}, [x1], #32
- subs x17, x17, #2
- // Update the counter.
- add w7, w6, #1
- add w6, w6, #2
- rev w7, w7
- mov v14.s[3], w7
- rev w7, w6
- mov v15.s[3], w7
- b.hi Lctr32_loop
-
-Lctr32_done:
- ldp d14,d15,[sp],#16
- ldp d12,d13,[sp],#16
- ldp d10,d11,[sp],#16
- ldp d8,d9,[sp],#16
- ldp x29,x30,[sp],#16
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
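
Note on the deleted vpaes-armv8-apple.S above: it is the NEON "vector permute" AES fallback BoringSSL uses when the ARMv8 AES extensions are unavailable, and its CTR entry point measures length in 16-byte blocks rather than bytes (see the comment near Lctr32_done). A minimal C sketch of that calling convention follows; the internal prototypes are an assumption based on BoringSSL's aes internals and should be checked against crypto/fipsmodule/aes/internal.h, since this roll only tracks the generated assembly.

#include <stddef.h>
#include <stdint.h>
#include <openssl/aes.h>

/* Assumed internal prototype for the assembly entry point deleted above:
 * the third argument is a count of 16-byte blocks, not bytes. */
void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
                                const AES_KEY *key, const uint8_t ivec[16]);

/* Hypothetical helper: encrypt |len| bytes (assumed to be a multiple of 16)
 * in CTR mode by converting the byte length to a block count first. */
static void ctr32_encrypt_bytes(const uint8_t *in, uint8_t *out, size_t len,
                                const AES_KEY *key, const uint8_t ivec[16]) {
  vpaes_ctr32_encrypt_blocks(in, out, len / 16, key, ivec);
}
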
diff --git a/apple-aarch64/crypto/test/trampoline-armv8-apple.S b/apple-aarch64/crypto/test/trampoline-armv8-apple.S
deleted file mode 100644
index 99055e0..0000000
--- a/apple-aarch64/crypto/test/trampoline-armv8-apple.S
+++ /dev/null
@@ -1,750 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
-// with |argv|, then saves the callee-saved registers into |state|. It returns
-// the result of |func|. The |unwind| argument is unused.
-// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
-// const uint64_t *argv, size_t argc,
-// uint64_t unwind);
-
-.globl _abi_test_trampoline
-.private_extern _abi_test_trampoline
-.align 4
-_abi_test_trampoline:
-Labi_test_trampoline_begin:
- AARCH64_SIGN_LINK_REGISTER
- // Stack layout (low to high addresses)
- // x29,x30 (16 bytes)
- // d8-d15 (64 bytes)
- // x19-x28 (80 bytes)
- // x1 (8 bytes)
- // padding (8 bytes)
- stp x29, x30, [sp, #-176]!
- mov x29, sp
-
- // Saved callee-saved registers and |state|.
- stp d8, d9, [sp, #16]
- stp d10, d11, [sp, #32]
- stp d12, d13, [sp, #48]
- stp d14, d15, [sp, #64]
- stp x19, x20, [sp, #80]
- stp x21, x22, [sp, #96]
- stp x23, x24, [sp, #112]
- stp x25, x26, [sp, #128]
- stp x27, x28, [sp, #144]
- str x1, [sp, #160]
-
- // Load registers from |state|, with the exception of x29. x29 is the
- // frame pointer and also callee-saved, but AAPCS64 allows platforms to
- // mandate that x29 always point to a frame. iOS64 does so, which means
- // we cannot fill x29 with entropy without violating ABI rules
- // ourselves. x29 is tested separately below.
- ldp d8, d9, [x1], #16
- ldp d10, d11, [x1], #16
- ldp d12, d13, [x1], #16
- ldp d14, d15, [x1], #16
- ldp x19, x20, [x1], #16
- ldp x21, x22, [x1], #16
- ldp x23, x24, [x1], #16
- ldp x25, x26, [x1], #16
- ldp x27, x28, [x1], #16
-
- // Move parameters into temporary registers.
- mov x9, x0
- mov x10, x2
- mov x11, x3
-
- // Load parameters into registers.
- cbz x11, Largs_done
- ldr x0, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x1, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x2, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x3, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x4, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x5, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x6, [x10], #8
- subs x11, x11, #1
- b.eq Largs_done
- ldr x7, [x10], #8
-
-Largs_done:
- blr x9
-
- // Reload |state| and store registers.
- ldr x1, [sp, #160]
- stp d8, d9, [x1], #16
- stp d10, d11, [x1], #16
- stp d12, d13, [x1], #16
- stp d14, d15, [x1], #16
- stp x19, x20, [x1], #16
- stp x21, x22, [x1], #16
- stp x23, x24, [x1], #16
- stp x25, x26, [x1], #16
- stp x27, x28, [x1], #16
-
- // |func| is required to preserve x29, the frame pointer. We cannot load
- // random values into x29 (see comment above), so compare it against the
- // expected value and zero the field of |state| if corrupted.
- mov x9, sp
- cmp x29, x9
- b.eq Lx29_ok
- str xzr, [x1]
-
-Lx29_ok:
- // Restore callee-saved registers.
- ldp d8, d9, [sp, #16]
- ldp d10, d11, [sp, #32]
- ldp d12, d13, [sp, #48]
- ldp d14, d15, [sp, #64]
- ldp x19, x20, [sp, #80]
- ldp x21, x22, [sp, #96]
- ldp x23, x24, [sp, #112]
- ldp x25, x26, [sp, #128]
- ldp x27, x28, [sp, #144]
-
- ldp x29, x30, [sp], #176
- AARCH64_VALIDATE_LINK_REGISTER
- ret
-
-
-.globl _abi_test_clobber_x0
-.private_extern _abi_test_clobber_x0
-.align 4
-_abi_test_clobber_x0:
- AARCH64_VALID_CALL_TARGET
- mov x0, xzr
- ret
-
-
-.globl _abi_test_clobber_x1
-.private_extern _abi_test_clobber_x1
-.align 4
-_abi_test_clobber_x1:
- AARCH64_VALID_CALL_TARGET
- mov x1, xzr
- ret
-
-
-.globl _abi_test_clobber_x2
-.private_extern _abi_test_clobber_x2
-.align 4
-_abi_test_clobber_x2:
- AARCH64_VALID_CALL_TARGET
- mov x2, xzr
- ret
-
-
-.globl _abi_test_clobber_x3
-.private_extern _abi_test_clobber_x3
-.align 4
-_abi_test_clobber_x3:
- AARCH64_VALID_CALL_TARGET
- mov x3, xzr
- ret
-
-
-.globl _abi_test_clobber_x4
-.private_extern _abi_test_clobber_x4
-.align 4
-_abi_test_clobber_x4:
- AARCH64_VALID_CALL_TARGET
- mov x4, xzr
- ret
-
-
-.globl _abi_test_clobber_x5
-.private_extern _abi_test_clobber_x5
-.align 4
-_abi_test_clobber_x5:
- AARCH64_VALID_CALL_TARGET
- mov x5, xzr
- ret
-
-
-.globl _abi_test_clobber_x6
-.private_extern _abi_test_clobber_x6
-.align 4
-_abi_test_clobber_x6:
- AARCH64_VALID_CALL_TARGET
- mov x6, xzr
- ret
-
-
-.globl _abi_test_clobber_x7
-.private_extern _abi_test_clobber_x7
-.align 4
-_abi_test_clobber_x7:
- AARCH64_VALID_CALL_TARGET
- mov x7, xzr
- ret
-
-
-.globl _abi_test_clobber_x8
-.private_extern _abi_test_clobber_x8
-.align 4
-_abi_test_clobber_x8:
- AARCH64_VALID_CALL_TARGET
- mov x8, xzr
- ret
-
-
-.globl _abi_test_clobber_x9
-.private_extern _abi_test_clobber_x9
-.align 4
-_abi_test_clobber_x9:
- AARCH64_VALID_CALL_TARGET
- mov x9, xzr
- ret
-
-
-.globl _abi_test_clobber_x10
-.private_extern _abi_test_clobber_x10
-.align 4
-_abi_test_clobber_x10:
- AARCH64_VALID_CALL_TARGET
- mov x10, xzr
- ret
-
-
-.globl _abi_test_clobber_x11
-.private_extern _abi_test_clobber_x11
-.align 4
-_abi_test_clobber_x11:
- AARCH64_VALID_CALL_TARGET
- mov x11, xzr
- ret
-
-
-.globl _abi_test_clobber_x12
-.private_extern _abi_test_clobber_x12
-.align 4
-_abi_test_clobber_x12:
- AARCH64_VALID_CALL_TARGET
- mov x12, xzr
- ret
-
-
-.globl _abi_test_clobber_x13
-.private_extern _abi_test_clobber_x13
-.align 4
-_abi_test_clobber_x13:
- AARCH64_VALID_CALL_TARGET
- mov x13, xzr
- ret
-
-
-.globl _abi_test_clobber_x14
-.private_extern _abi_test_clobber_x14
-.align 4
-_abi_test_clobber_x14:
- AARCH64_VALID_CALL_TARGET
- mov x14, xzr
- ret
-
-
-.globl _abi_test_clobber_x15
-.private_extern _abi_test_clobber_x15
-.align 4
-_abi_test_clobber_x15:
- AARCH64_VALID_CALL_TARGET
- mov x15, xzr
- ret
-
-
-.globl _abi_test_clobber_x16
-.private_extern _abi_test_clobber_x16
-.align 4
-_abi_test_clobber_x16:
- AARCH64_VALID_CALL_TARGET
- mov x16, xzr
- ret
-
-
-.globl _abi_test_clobber_x17
-.private_extern _abi_test_clobber_x17
-.align 4
-_abi_test_clobber_x17:
- AARCH64_VALID_CALL_TARGET
- mov x17, xzr
- ret
-
-
-.globl _abi_test_clobber_x19
-.private_extern _abi_test_clobber_x19
-.align 4
-_abi_test_clobber_x19:
- AARCH64_VALID_CALL_TARGET
- mov x19, xzr
- ret
-
-
-.globl _abi_test_clobber_x20
-.private_extern _abi_test_clobber_x20
-.align 4
-_abi_test_clobber_x20:
- AARCH64_VALID_CALL_TARGET
- mov x20, xzr
- ret
-
-
-.globl _abi_test_clobber_x21
-.private_extern _abi_test_clobber_x21
-.align 4
-_abi_test_clobber_x21:
- AARCH64_VALID_CALL_TARGET
- mov x21, xzr
- ret
-
-
-.globl _abi_test_clobber_x22
-.private_extern _abi_test_clobber_x22
-.align 4
-_abi_test_clobber_x22:
- AARCH64_VALID_CALL_TARGET
- mov x22, xzr
- ret
-
-
-.globl _abi_test_clobber_x23
-.private_extern _abi_test_clobber_x23
-.align 4
-_abi_test_clobber_x23:
- AARCH64_VALID_CALL_TARGET
- mov x23, xzr
- ret
-
-
-.globl _abi_test_clobber_x24
-.private_extern _abi_test_clobber_x24
-.align 4
-_abi_test_clobber_x24:
- AARCH64_VALID_CALL_TARGET
- mov x24, xzr
- ret
-
-
-.globl _abi_test_clobber_x25
-.private_extern _abi_test_clobber_x25
-.align 4
-_abi_test_clobber_x25:
- AARCH64_VALID_CALL_TARGET
- mov x25, xzr
- ret
-
-
-.globl _abi_test_clobber_x26
-.private_extern _abi_test_clobber_x26
-.align 4
-_abi_test_clobber_x26:
- AARCH64_VALID_CALL_TARGET
- mov x26, xzr
- ret
-
-
-.globl _abi_test_clobber_x27
-.private_extern _abi_test_clobber_x27
-.align 4
-_abi_test_clobber_x27:
- AARCH64_VALID_CALL_TARGET
- mov x27, xzr
- ret
-
-
-.globl _abi_test_clobber_x28
-.private_extern _abi_test_clobber_x28
-.align 4
-_abi_test_clobber_x28:
- AARCH64_VALID_CALL_TARGET
- mov x28, xzr
- ret
-
-
-.globl _abi_test_clobber_x29
-.private_extern _abi_test_clobber_x29
-.align 4
-_abi_test_clobber_x29:
- AARCH64_VALID_CALL_TARGET
- mov x29, xzr
- ret
-
-
-.globl _abi_test_clobber_d0
-.private_extern _abi_test_clobber_d0
-.align 4
-_abi_test_clobber_d0:
- AARCH64_VALID_CALL_TARGET
- fmov d0, xzr
- ret
-
-
-.globl _abi_test_clobber_d1
-.private_extern _abi_test_clobber_d1
-.align 4
-_abi_test_clobber_d1:
- AARCH64_VALID_CALL_TARGET
- fmov d1, xzr
- ret
-
-
-.globl _abi_test_clobber_d2
-.private_extern _abi_test_clobber_d2
-.align 4
-_abi_test_clobber_d2:
- AARCH64_VALID_CALL_TARGET
- fmov d2, xzr
- ret
-
-
-.globl _abi_test_clobber_d3
-.private_extern _abi_test_clobber_d3
-.align 4
-_abi_test_clobber_d3:
- AARCH64_VALID_CALL_TARGET
- fmov d3, xzr
- ret
-
-
-.globl _abi_test_clobber_d4
-.private_extern _abi_test_clobber_d4
-.align 4
-_abi_test_clobber_d4:
- AARCH64_VALID_CALL_TARGET
- fmov d4, xzr
- ret
-
-
-.globl _abi_test_clobber_d5
-.private_extern _abi_test_clobber_d5
-.align 4
-_abi_test_clobber_d5:
- AARCH64_VALID_CALL_TARGET
- fmov d5, xzr
- ret
-
-
-.globl _abi_test_clobber_d6
-.private_extern _abi_test_clobber_d6
-.align 4
-_abi_test_clobber_d6:
- AARCH64_VALID_CALL_TARGET
- fmov d6, xzr
- ret
-
-
-.globl _abi_test_clobber_d7
-.private_extern _abi_test_clobber_d7
-.align 4
-_abi_test_clobber_d7:
- AARCH64_VALID_CALL_TARGET
- fmov d7, xzr
- ret
-
-
-.globl _abi_test_clobber_d8
-.private_extern _abi_test_clobber_d8
-.align 4
-_abi_test_clobber_d8:
- AARCH64_VALID_CALL_TARGET
- fmov d8, xzr
- ret
-
-
-.globl _abi_test_clobber_d9
-.private_extern _abi_test_clobber_d9
-.align 4
-_abi_test_clobber_d9:
- AARCH64_VALID_CALL_TARGET
- fmov d9, xzr
- ret
-
-
-.globl _abi_test_clobber_d10
-.private_extern _abi_test_clobber_d10
-.align 4
-_abi_test_clobber_d10:
- AARCH64_VALID_CALL_TARGET
- fmov d10, xzr
- ret
-
-
-.globl _abi_test_clobber_d11
-.private_extern _abi_test_clobber_d11
-.align 4
-_abi_test_clobber_d11:
- AARCH64_VALID_CALL_TARGET
- fmov d11, xzr
- ret
-
-
-.globl _abi_test_clobber_d12
-.private_extern _abi_test_clobber_d12
-.align 4
-_abi_test_clobber_d12:
- AARCH64_VALID_CALL_TARGET
- fmov d12, xzr
- ret
-
-
-.globl _abi_test_clobber_d13
-.private_extern _abi_test_clobber_d13
-.align 4
-_abi_test_clobber_d13:
- AARCH64_VALID_CALL_TARGET
- fmov d13, xzr
- ret
-
-
-.globl _abi_test_clobber_d14
-.private_extern _abi_test_clobber_d14
-.align 4
-_abi_test_clobber_d14:
- AARCH64_VALID_CALL_TARGET
- fmov d14, xzr
- ret
-
-
-.globl _abi_test_clobber_d15
-.private_extern _abi_test_clobber_d15
-.align 4
-_abi_test_clobber_d15:
- AARCH64_VALID_CALL_TARGET
- fmov d15, xzr
- ret
-
-
-.globl _abi_test_clobber_d16
-.private_extern _abi_test_clobber_d16
-.align 4
-_abi_test_clobber_d16:
- AARCH64_VALID_CALL_TARGET
- fmov d16, xzr
- ret
-
-
-.globl _abi_test_clobber_d17
-.private_extern _abi_test_clobber_d17
-.align 4
-_abi_test_clobber_d17:
- AARCH64_VALID_CALL_TARGET
- fmov d17, xzr
- ret
-
-
-.globl _abi_test_clobber_d18
-.private_extern _abi_test_clobber_d18
-.align 4
-_abi_test_clobber_d18:
- AARCH64_VALID_CALL_TARGET
- fmov d18, xzr
- ret
-
-
-.globl _abi_test_clobber_d19
-.private_extern _abi_test_clobber_d19
-.align 4
-_abi_test_clobber_d19:
- AARCH64_VALID_CALL_TARGET
- fmov d19, xzr
- ret
-
-
-.globl _abi_test_clobber_d20
-.private_extern _abi_test_clobber_d20
-.align 4
-_abi_test_clobber_d20:
- AARCH64_VALID_CALL_TARGET
- fmov d20, xzr
- ret
-
-
-.globl _abi_test_clobber_d21
-.private_extern _abi_test_clobber_d21
-.align 4
-_abi_test_clobber_d21:
- AARCH64_VALID_CALL_TARGET
- fmov d21, xzr
- ret
-
-
-.globl _abi_test_clobber_d22
-.private_extern _abi_test_clobber_d22
-.align 4
-_abi_test_clobber_d22:
- AARCH64_VALID_CALL_TARGET
- fmov d22, xzr
- ret
-
-
-.globl _abi_test_clobber_d23
-.private_extern _abi_test_clobber_d23
-.align 4
-_abi_test_clobber_d23:
- AARCH64_VALID_CALL_TARGET
- fmov d23, xzr
- ret
-
-
-.globl _abi_test_clobber_d24
-.private_extern _abi_test_clobber_d24
-.align 4
-_abi_test_clobber_d24:
- AARCH64_VALID_CALL_TARGET
- fmov d24, xzr
- ret
-
-
-.globl _abi_test_clobber_d25
-.private_extern _abi_test_clobber_d25
-.align 4
-_abi_test_clobber_d25:
- AARCH64_VALID_CALL_TARGET
- fmov d25, xzr
- ret
-
-
-.globl _abi_test_clobber_d26
-.private_extern _abi_test_clobber_d26
-.align 4
-_abi_test_clobber_d26:
- AARCH64_VALID_CALL_TARGET
- fmov d26, xzr
- ret
-
-
-.globl _abi_test_clobber_d27
-.private_extern _abi_test_clobber_d27
-.align 4
-_abi_test_clobber_d27:
- AARCH64_VALID_CALL_TARGET
- fmov d27, xzr
- ret
-
-
-.globl _abi_test_clobber_d28
-.private_extern _abi_test_clobber_d28
-.align 4
-_abi_test_clobber_d28:
- AARCH64_VALID_CALL_TARGET
- fmov d28, xzr
- ret
-
-
-.globl _abi_test_clobber_d29
-.private_extern _abi_test_clobber_d29
-.align 4
-_abi_test_clobber_d29:
- AARCH64_VALID_CALL_TARGET
- fmov d29, xzr
- ret
-
-
-.globl _abi_test_clobber_d30
-.private_extern _abi_test_clobber_d30
-.align 4
-_abi_test_clobber_d30:
- AARCH64_VALID_CALL_TARGET
- fmov d30, xzr
- ret
-
-
-.globl _abi_test_clobber_d31
-.private_extern _abi_test_clobber_d31
-.align 4
-_abi_test_clobber_d31:
- AARCH64_VALID_CALL_TARGET
- fmov d31, xzr
- ret
-
-
-.globl _abi_test_clobber_v8_upper
-.private_extern _abi_test_clobber_v8_upper
-.align 4
-_abi_test_clobber_v8_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v8.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v9_upper
-.private_extern _abi_test_clobber_v9_upper
-.align 4
-_abi_test_clobber_v9_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v9.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v10_upper
-.private_extern _abi_test_clobber_v10_upper
-.align 4
-_abi_test_clobber_v10_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v10.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v11_upper
-.private_extern _abi_test_clobber_v11_upper
-.align 4
-_abi_test_clobber_v11_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v11.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v12_upper
-.private_extern _abi_test_clobber_v12_upper
-.align 4
-_abi_test_clobber_v12_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v12.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v13_upper
-.private_extern _abi_test_clobber_v13_upper
-.align 4
-_abi_test_clobber_v13_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v13.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v14_upper
-.private_extern _abi_test_clobber_v14_upper
-.align 4
-_abi_test_clobber_v14_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v14.d[1], xzr
- ret
-
-
-.globl _abi_test_clobber_v15_upper
-.private_extern _abi_test_clobber_v15_upper
-.align 4
-_abi_test_clobber_v15_upper:
- AARCH64_VALID_CALL_TARGET
- fmov v15.d[1], xzr
- ret
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
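
The deleted trampoline-armv8-apple.S backs BoringSSL's ABI-testing harness: abi_test_trampoline loads callee-saved registers from a CallerState, calls the function under test, and stores the registers back so the harness can detect clobbers. A rough C-side sketch of that contract is below; the struct layout is only inferred from the load order in the assembly above (d8-d15, then x19-x28), and the real definitions live in BoringSSL's abi_test support code, so treat the names and layout as illustrative assumptions.

#include <stddef.h>
#include <stdint.h>

/* Assumed shape of CallerState for AArch64, mirroring the order in which the
 * trampoline loads and stores registers: d8..d15 first, then x19..x28. */
typedef struct {
  uint64_t d[8];   /* d8..d15 (bit patterns of the FP registers) */
  uint64_t x[10];  /* x19..x28 */
} CallerState;

/* Assumed prototype, adapted from the comment in the deleted file; the
 * |unwind| argument is unused by this trampoline. */
uint64_t abi_test_trampoline(void (*func)(void), CallerState *state,
                             const uint64_t *argv, size_t argc,
                             uint64_t unwind);
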
diff --git a/apple-arm/crypto/chacha/chacha-armv4-apple.S b/apple-arm/crypto/chacha/chacha-armv4-apple.S
deleted file mode 100644
index cf2644e..0000000
--- a/apple-arm/crypto/chacha/chacha-armv4-apple.S
+++ /dev/null
@@ -1,1490 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-#if defined(__thumb2__) || defined(__clang__)
-#define ldrhsb ldrbhs
-#endif
-
-.align 5
-Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-Lone:
-.long 1,0,0,0
-#if __ARM_MAX_ARCH__>=7
-LOPENSSL_armcap:
-.word OPENSSL_armcap_P-LChaCha20_ctr32
-#else
-.word -1
-#endif
-
-.globl _ChaCha20_ctr32
-.private_extern _ChaCha20_ctr32
-#ifdef __thumb2__
-.thumb_func _ChaCha20_ctr32
-#endif
-.align 5
-_ChaCha20_ctr32:
-LChaCha20_ctr32:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0,r1,r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r14,pc,#16 @ _ChaCha20_ctr32
-#else
- adr r14,LChaCha20_ctr32
-#endif
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq Lno_data
-#if __ARM_MAX_ARCH__>=7
- cmp r2,#192 @ test len
- bls Lshort
- ldr r4,[r14,#-32]
- ldr r4,[r14,r4]
-# ifdef __APPLE__
- ldr r4,[r4]
-# endif
- tst r4,#ARMV7_NEON
- bne LChaCha20_neon
-Lshort:
-#endif
- ldmia r12,{r4,r5,r6,r7} @ load counter and nonce
- sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ Lsigma
- stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce
- ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
- ldmia r14,{r0,r1,r2,r3} @ load sigma
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key
- stmdb sp!,{r0,r1,r2,r3} @ copy sigma
- str r10,[sp,#4*(16+10)] @ off-load "rx"
- str r11,[sp,#4*(16+11)] @ off-load "rx"
- b Loop_outer_enter
-
-.align 4
-Loop_outer:
- ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
- str r11,[sp,#4*(32+2)] @ save len
- str r12, [sp,#4*(32+1)] @ save inp
- str r14, [sp,#4*(32+0)] @ save out
-Loop_outer_enter:
- ldr r11, [sp,#4*(15)]
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- ldr r10, [sp,#4*(13)]
- ldr r14,[sp,#4*(14)]
- str r11, [sp,#4*(16+15)]
- mov r11,#10
- b Loop
-
-.align 4
-Loop:
- subs r11,r11,#1
- add r0,r0,r4
- mov r12,r12,ror#16
- add r1,r1,r5
- mov r10,r10,ror#16
- eor r12,r12,r0,ror#16
- eor r10,r10,r1,ror#16
- add r8,r8,r12
- mov r4,r4,ror#20
- add r9,r9,r10
- mov r5,r5,ror#20
- eor r4,r4,r8,ror#20
- eor r5,r5,r9,ror#20
- add r0,r0,r4
- mov r12,r12,ror#24
- add r1,r1,r5
- mov r10,r10,ror#24
- eor r12,r12,r0,ror#24
- eor r10,r10,r1,ror#24
- add r8,r8,r12
- mov r4,r4,ror#25
- add r9,r9,r10
- mov r5,r5,ror#25
- str r10,[sp,#4*(16+13)]
- ldr r10,[sp,#4*(16+15)]
- eor r4,r4,r8,ror#25
- eor r5,r5,r9,ror#25
- str r8,[sp,#4*(16+8)]
- ldr r8,[sp,#4*(16+10)]
- add r2,r2,r6
- mov r14,r14,ror#16
- str r9,[sp,#4*(16+9)]
- ldr r9,[sp,#4*(16+11)]
- add r3,r3,r7
- mov r10,r10,ror#16
- eor r14,r14,r2,ror#16
- eor r10,r10,r3,ror#16
- add r8,r8,r14
- mov r6,r6,ror#20
- add r9,r9,r10
- mov r7,r7,ror#20
- eor r6,r6,r8,ror#20
- eor r7,r7,r9,ror#20
- add r2,r2,r6
- mov r14,r14,ror#24
- add r3,r3,r7
- mov r10,r10,ror#24
- eor r14,r14,r2,ror#24
- eor r10,r10,r3,ror#24
- add r8,r8,r14
- mov r6,r6,ror#25
- add r9,r9,r10
- mov r7,r7,ror#25
- eor r6,r6,r8,ror#25
- eor r7,r7,r9,ror#25
- add r0,r0,r5
- mov r10,r10,ror#16
- add r1,r1,r6
- mov r12,r12,ror#16
- eor r10,r10,r0,ror#16
- eor r12,r12,r1,ror#16
- add r8,r8,r10
- mov r5,r5,ror#20
- add r9,r9,r12
- mov r6,r6,ror#20
- eor r5,r5,r8,ror#20
- eor r6,r6,r9,ror#20
- add r0,r0,r5
- mov r10,r10,ror#24
- add r1,r1,r6
- mov r12,r12,ror#24
- eor r10,r10,r0,ror#24
- eor r12,r12,r1,ror#24
- add r8,r8,r10
- mov r5,r5,ror#25
- str r10,[sp,#4*(16+15)]
- ldr r10,[sp,#4*(16+13)]
- add r9,r9,r12
- mov r6,r6,ror#25
- eor r5,r5,r8,ror#25
- eor r6,r6,r9,ror#25
- str r8,[sp,#4*(16+10)]
- ldr r8,[sp,#4*(16+8)]
- add r2,r2,r7
- mov r10,r10,ror#16
- str r9,[sp,#4*(16+11)]
- ldr r9,[sp,#4*(16+9)]
- add r3,r3,r4
- mov r14,r14,ror#16
- eor r10,r10,r2,ror#16
- eor r14,r14,r3,ror#16
- add r8,r8,r10
- mov r7,r7,ror#20
- add r9,r9,r14
- mov r4,r4,ror#20
- eor r7,r7,r8,ror#20
- eor r4,r4,r9,ror#20
- add r2,r2,r7
- mov r10,r10,ror#24
- add r3,r3,r4
- mov r14,r14,ror#24
- eor r10,r10,r2,ror#24
- eor r14,r14,r3,ror#24
- add r8,r8,r10
- mov r7,r7,ror#25
- add r9,r9,r14
- mov r4,r4,ror#25
- eor r7,r7,r8,ror#25
- eor r4,r4,r9,ror#25
- bne Loop
-
- ldr r11,[sp,#4*(32+2)] @ load len
-
- str r8, [sp,#4*(16+8)] @ modulo-scheduled store
- str r9, [sp,#4*(16+9)]
- str r12,[sp,#4*(16+12)]
- str r10, [sp,#4*(16+13)]
- str r14,[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ rx and second half at sp+4*(16+8)
-
- cmp r11,#64 @ done yet?
-#ifdef __thumb2__
- itete lo
-#endif
- addlo r12,sp,#4*(0) @ shortcut or ...
- ldrhs r12,[sp,#4*(32+1)] @ ... load inp
- addlo r14,sp,#4*(0) @ shortcut or ...
- ldrhs r14,[sp,#4*(32+0)] @ ... load out
-
- ldr r8,[sp,#4*(0)] @ load key material
- ldr r9,[sp,#4*(1)]
-
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
- orr r10,r12,r14
- tst r10,#3 @ are input and output aligned?
- ldr r10,[sp,#4*(2)]
- bne Lunaligned
- cmp r11,#64 @ restore flags
-# else
- ldr r10,[sp,#4*(2)]
-# endif
- ldr r11,[sp,#4*(3)]
-
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
-
- add r2,r2,r10
- add r3,r3,r11
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r0,r0,r8 @ xor with input
- eorhs r1,r1,r9
- add r8,sp,#4*(4)
- str r0,[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r2,r2,r10
- eorhs r3,r3,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r1,[r14,#-12]
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
- add r6,r6,r10
- add r7,r7,r11
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r4,r4,r8
- eorhs r5,r5,r9
- add r8,sp,#4*(8)
- str r4,[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r6,r6,r10
- eorhs r7,r7,r11
- str r5,[r14,#-12]
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r6,[r14,#-8]
- add r0,sp,#4*(16+8)
- str r7,[r14,#-4]
-
- ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
-# ifdef __thumb2__
- itt hi
-# endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
- strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
- add r2,r2,r10
- add r3,r3,r11
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r0,r0,r8
- eorhs r1,r1,r9
- add r8,sp,#4*(12)
- str r0,[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r2,r2,r10
- eorhs r3,r3,r11
- str r1,[r14,#-12]
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
-# ifdef __thumb2__
- itt hi
-# endif
- addhi r8,r8,#1 @ next counter value
- strhi r8,[sp,#4*(12)] @ save next counter value
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r8,[r12],#16 @ load input
- ldrhs r9,[r12,#-12]
- add r6,r6,r10
- add r7,r7,r11
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs r10,[r12,#-8]
- ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r4,r4,r8
- eorhs r5,r5,r9
-# ifdef __thumb2__
- it ne
-# endif
- ldrne r8,[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs r6,r6,r10
- eorhs r7,r7,r11
- str r4,[r14],#16 @ store output
- str r5,[r14,#-12]
-# ifdef __thumb2__
- it hs
-# endif
- subhs r11,r8,#64 @ len-=64
- str r6,[r14,#-8]
- str r7,[r14,#-4]
- bhi Loop_outer
-
- beq Ldone
-# if __ARM_ARCH__<7
- b Ltail
-
-.align 4
-Lunaligned:@ unaligned endian-neutral path
- cmp r11,#64 @ restore flags
-# endif
-#endif
-#if __ARM_ARCH__<7
- ldr r11,[sp,#4*(3)]
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
- add r2,r2,r10
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r3,r3,r11
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r0,r8,r0 @ xor with input (or zero)
- eor r1,r9,r1
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r2,r10,r2
- strb r0,[r14],#16 @ store output
- eor r3,r11,r3
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r1,[r14,#-12]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-8]
- eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r3,[r14,#-4]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-15]
- eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r1,[r14,#-11]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-7]
- eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r3,[r14,#-3]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-14]
- eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r1,[r14,#-10]
- strb r2,[r14,#-6]
- eor r0,r8,r0,lsr#8
- strb r3,[r14,#-2]
- eor r1,r9,r1,lsr#8
- strb r0,[r14,#-13]
- eor r2,r10,r2,lsr#8
- strb r1,[r14,#-9]
- eor r3,r11,r3,lsr#8
- strb r2,[r14,#-5]
- strb r3,[r14,#-1]
- add r8,sp,#4*(4+0)
- ldmia r8,{r8,r9,r10,r11} @ load key material
- add r0,sp,#4*(16+8)
- add r4,r4,r8 @ accumulate key material
- add r5,r5,r9
- add r6,r6,r10
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r7,r7,r11
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r4,r8,r4 @ xor with input (or zero)
- eor r5,r9,r5
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r6,r10,r6
- strb r4,[r14],#16 @ store output
- eor r7,r11,r7
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r5,[r14,#-12]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-8]
- eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r7,[r14,#-4]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-15]
- eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r5,[r14,#-11]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-7]
- eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r7,[r14,#-3]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-14]
- eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r5,[r14,#-10]
- strb r6,[r14,#-6]
- eor r4,r8,r4,lsr#8
- strb r7,[r14,#-2]
- eor r5,r9,r5,lsr#8
- strb r4,[r14,#-13]
- eor r6,r10,r6,lsr#8
- strb r5,[r14,#-9]
- eor r7,r11,r7,lsr#8
- strb r6,[r14,#-5]
- strb r7,[r14,#-1]
- add r8,sp,#4*(4+4)
- ldmia r8,{r8,r9,r10,r11} @ load key material
- ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
-# ifdef __thumb2__
- itt hi
-# endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx"
- strhi r11,[sp,#4*(16+11)] @ copy "rx"
- add r0,r0,r8 @ accumulate key material
- add r1,r1,r9
- add r2,r2,r10
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r3,r3,r11
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r0,r8,r0 @ xor with input (or zero)
- eor r1,r9,r1
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r2,r10,r2
- strb r0,[r14],#16 @ store output
- eor r3,r11,r3
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r1,[r14,#-12]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-8]
- eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r3,[r14,#-4]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-15]
- eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r1,[r14,#-11]
- eor r0,r8,r0,lsr#8
- strb r2,[r14,#-7]
- eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r3,[r14,#-3]
- eor r2,r10,r2,lsr#8
- strb r0,[r14,#-14]
- eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r1,[r14,#-10]
- strb r2,[r14,#-6]
- eor r0,r8,r0,lsr#8
- strb r3,[r14,#-2]
- eor r1,r9,r1,lsr#8
- strb r0,[r14,#-13]
- eor r2,r10,r2,lsr#8
- strb r1,[r14,#-9]
- eor r3,r11,r3,lsr#8
- strb r2,[r14,#-5]
- strb r3,[r14,#-1]
- add r8,sp,#4*(4+8)
- ldmia r8,{r8,r9,r10,r11} @ load key material
- add r4,r4,r8 @ accumulate key material
-# ifdef __thumb2__
- itt hi
-# endif
- addhi r8,r8,#1 @ next counter value
- strhi r8,[sp,#4*(12)] @ save next counter value
- add r5,r5,r9
- add r6,r6,r10
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r8,r8,r8 @ zero or ...
- ldrhsb r8,[r12],#16 @ ... load input
- eorlo r9,r9,r9
- ldrhsb r9,[r12,#-12]
-
- add r7,r7,r11
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo r10,r10,r10
- ldrhsb r10,[r12,#-8]
- eorlo r11,r11,r11
- ldrhsb r11,[r12,#-4]
-
- eor r4,r8,r4 @ xor with input (or zero)
- eor r5,r9,r5
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-15] @ load more input
- ldrhsb r9,[r12,#-11]
- eor r6,r10,r6
- strb r4,[r14],#16 @ store output
- eor r7,r11,r7
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-7]
- ldrhsb r11,[r12,#-3]
- strb r5,[r14,#-12]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-8]
- eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-14] @ load more input
- ldrhsb r9,[r12,#-10]
- strb r7,[r14,#-4]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-15]
- eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-6]
- ldrhsb r11,[r12,#-2]
- strb r5,[r14,#-11]
- eor r4,r8,r4,lsr#8
- strb r6,[r14,#-7]
- eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r8,[r12,#-13] @ load more input
- ldrhsb r9,[r12,#-9]
- strb r7,[r14,#-3]
- eor r6,r10,r6,lsr#8
- strb r4,[r14,#-14]
- eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb r10,[r12,#-5]
- ldrhsb r11,[r12,#-1]
- strb r5,[r14,#-10]
- strb r6,[r14,#-6]
- eor r4,r8,r4,lsr#8
- strb r7,[r14,#-2]
- eor r5,r9,r5,lsr#8
- strb r4,[r14,#-13]
- eor r6,r10,r6,lsr#8
- strb r5,[r14,#-9]
- eor r7,r11,r7,lsr#8
- strb r6,[r14,#-5]
- strb r7,[r14,#-1]
-# ifdef __thumb2__
- it ne
-# endif
- ldrne r8,[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- it hs
-# endif
- subhs r11,r8,#64 @ len-=64
- bhi Loop_outer
-
- beq Ldone
-#endif
-
-Ltail:
- ldr r12,[sp,#4*(32+1)] @ load inp
- add r9,sp,#4*(0)
- ldr r14,[sp,#4*(32+0)] @ load out
-
-Loop_tail:
- ldrb r10,[r9],#1 @ read buffer on stack
- ldrb r11,[r12],#1 @ read input
- subs r8,r8,#1
- eor r11,r11,r10
- strb r11,[r14],#1 @ store output
- bne Loop_tail
-
-Ldone:
- add sp,sp,#4*(32+3)
-Lno_data:
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func ChaCha20_neon
-#endif
-.align 5
-ChaCha20_neon:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0,r1,r2,r4-r11,lr}
-LChaCha20_neon:
- adr r14,Lsigma
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so
- stmdb sp!,{r0,r1,r2,r3}
-
- vld1.32 {q1,q2},[r3] @ load key
- ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
-
- sub sp,sp,#4*(16+16)
- vld1.32 {q3},[r12] @ load counter and nonce
- add r12,sp,#4*8
- ldmia r14,{r0,r1,r2,r3} @ load sigma
- vld1.32 {q0},[r14]! @ load sigma
- vld1.32 {q12},[r14] @ one
- vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce
- vst1.32 {q0,q1},[sp] @ copy sigma|1/2key
-
- str r10,[sp,#4*(16+10)] @ off-load "rx"
- str r11,[sp,#4*(16+11)] @ off-load "rx"
- vshl.i32 d26,d24,#1 @ two
- vstr d24,[sp,#4*(16+0)]
- vshl.i32 d28,d24,#2 @ four
- vstr d26,[sp,#4*(16+2)]
- vmov q4,q0
- vstr d28,[sp,#4*(16+4)]
- vmov q8,q0
- vmov q5,q1
- vmov q9,q1
- b Loop_neon_enter
-
-.align 4
-Loop_neon_outer:
- ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
- cmp r11,#64*2 @ if len<=64*2
- bls Lbreak_neon @ switch to integer-only
- vmov q4,q0
- str r11,[sp,#4*(32+2)] @ save len
- vmov q8,q0
- str r12, [sp,#4*(32+1)] @ save inp
- vmov q5,q1
- str r14, [sp,#4*(32+0)] @ save out
- vmov q9,q1
-Loop_neon_enter:
- ldr r11, [sp,#4*(15)]
- vadd.i32 q7,q3,q12 @ counter+1
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- vmov q6,q2
- ldr r10, [sp,#4*(13)]
- vmov q10,q2
- ldr r14,[sp,#4*(14)]
- vadd.i32 q11,q7,q12 @ counter+2
- str r11, [sp,#4*(16+15)]
- mov r11,#10
- add r12,r12,#3 @ counter+3
- b Loop_neon
-
-.align 4
-Loop_neon:
- subs r11,r11,#1
- vadd.i32 q0,q0,q1
- add r0,r0,r4
- vadd.i32 q4,q4,q5
- mov r12,r12,ror#16
- vadd.i32 q8,q8,q9
- add r1,r1,r5
- veor q3,q3,q0
- mov r10,r10,ror#16
- veor q7,q7,q4
- eor r12,r12,r0,ror#16
- veor q11,q11,q8
- eor r10,r10,r1,ror#16
- vrev32.16 q3,q3
- add r8,r8,r12
- vrev32.16 q7,q7
- mov r4,r4,ror#20
- vrev32.16 q11,q11
- add r9,r9,r10
- vadd.i32 q2,q2,q3
- mov r5,r5,ror#20
- vadd.i32 q6,q6,q7
- eor r4,r4,r8,ror#20
- vadd.i32 q10,q10,q11
- eor r5,r5,r9,ror#20
- veor q12,q1,q2
- add r0,r0,r4
- veor q13,q5,q6
- mov r12,r12,ror#24
- veor q14,q9,q10
- add r1,r1,r5
- vshr.u32 q1,q12,#20
- mov r10,r10,ror#24
- vshr.u32 q5,q13,#20
- eor r12,r12,r0,ror#24
- vshr.u32 q9,q14,#20
- eor r10,r10,r1,ror#24
- vsli.32 q1,q12,#12
- add r8,r8,r12
- vsli.32 q5,q13,#12
- mov r4,r4,ror#25
- vsli.32 q9,q14,#12
- add r9,r9,r10
- vadd.i32 q0,q0,q1
- mov r5,r5,ror#25
- vadd.i32 q4,q4,q5
- str r10,[sp,#4*(16+13)]
- vadd.i32 q8,q8,q9
- ldr r10,[sp,#4*(16+15)]
- veor q12,q3,q0
- eor r4,r4,r8,ror#25
- veor q13,q7,q4
- eor r5,r5,r9,ror#25
- veor q14,q11,q8
- str r8,[sp,#4*(16+8)]
- vshr.u32 q3,q12,#24
- ldr r8,[sp,#4*(16+10)]
- vshr.u32 q7,q13,#24
- add r2,r2,r6
- vshr.u32 q11,q14,#24
- mov r14,r14,ror#16
- vsli.32 q3,q12,#8
- str r9,[sp,#4*(16+9)]
- vsli.32 q7,q13,#8
- ldr r9,[sp,#4*(16+11)]
- vsli.32 q11,q14,#8
- add r3,r3,r7
- vadd.i32 q2,q2,q3
- mov r10,r10,ror#16
- vadd.i32 q6,q6,q7
- eor r14,r14,r2,ror#16
- vadd.i32 q10,q10,q11
- eor r10,r10,r3,ror#16
- veor q12,q1,q2
- add r8,r8,r14
- veor q13,q5,q6
- mov r6,r6,ror#20
- veor q14,q9,q10
- add r9,r9,r10
- vshr.u32 q1,q12,#25
- mov r7,r7,ror#20
- vshr.u32 q5,q13,#25
- eor r6,r6,r8,ror#20
- vshr.u32 q9,q14,#25
- eor r7,r7,r9,ror#20
- vsli.32 q1,q12,#7
- add r2,r2,r6
- vsli.32 q5,q13,#7
- mov r14,r14,ror#24
- vsli.32 q9,q14,#7
- add r3,r3,r7
- vext.8 q2,q2,q2,#8
- mov r10,r10,ror#24
- vext.8 q6,q6,q6,#8
- eor r14,r14,r2,ror#24
- vext.8 q10,q10,q10,#8
- eor r10,r10,r3,ror#24
- vext.8 q1,q1,q1,#4
- add r8,r8,r14
- vext.8 q5,q5,q5,#4
- mov r6,r6,ror#25
- vext.8 q9,q9,q9,#4
- add r9,r9,r10
- vext.8 q3,q3,q3,#12
- mov r7,r7,ror#25
- vext.8 q7,q7,q7,#12
- eor r6,r6,r8,ror#25
- vext.8 q11,q11,q11,#12
- eor r7,r7,r9,ror#25
- vadd.i32 q0,q0,q1
- add r0,r0,r5
- vadd.i32 q4,q4,q5
- mov r10,r10,ror#16
- vadd.i32 q8,q8,q9
- add r1,r1,r6
- veor q3,q3,q0
- mov r12,r12,ror#16
- veor q7,q7,q4
- eor r10,r10,r0,ror#16
- veor q11,q11,q8
- eor r12,r12,r1,ror#16
- vrev32.16 q3,q3
- add r8,r8,r10
- vrev32.16 q7,q7
- mov r5,r5,ror#20
- vrev32.16 q11,q11
- add r9,r9,r12
- vadd.i32 q2,q2,q3
- mov r6,r6,ror#20
- vadd.i32 q6,q6,q7
- eor r5,r5,r8,ror#20
- vadd.i32 q10,q10,q11
- eor r6,r6,r9,ror#20
- veor q12,q1,q2
- add r0,r0,r5
- veor q13,q5,q6
- mov r10,r10,ror#24
- veor q14,q9,q10
- add r1,r1,r6
- vshr.u32 q1,q12,#20
- mov r12,r12,ror#24
- vshr.u32 q5,q13,#20
- eor r10,r10,r0,ror#24
- vshr.u32 q9,q14,#20
- eor r12,r12,r1,ror#24
- vsli.32 q1,q12,#12
- add r8,r8,r10
- vsli.32 q5,q13,#12
- mov r5,r5,ror#25
- vsli.32 q9,q14,#12
- str r10,[sp,#4*(16+15)]
- vadd.i32 q0,q0,q1
- ldr r10,[sp,#4*(16+13)]
- vadd.i32 q4,q4,q5
- add r9,r9,r12
- vadd.i32 q8,q8,q9
- mov r6,r6,ror#25
- veor q12,q3,q0
- eor r5,r5,r8,ror#25
- veor q13,q7,q4
- eor r6,r6,r9,ror#25
- veor q14,q11,q8
- str r8,[sp,#4*(16+10)]
- vshr.u32 q3,q12,#24
- ldr r8,[sp,#4*(16+8)]
- vshr.u32 q7,q13,#24
- add r2,r2,r7
- vshr.u32 q11,q14,#24
- mov r10,r10,ror#16
- vsli.32 q3,q12,#8
- str r9,[sp,#4*(16+11)]
- vsli.32 q7,q13,#8
- ldr r9,[sp,#4*(16+9)]
- vsli.32 q11,q14,#8
- add r3,r3,r4
- vadd.i32 q2,q2,q3
- mov r14,r14,ror#16
- vadd.i32 q6,q6,q7
- eor r10,r10,r2,ror#16
- vadd.i32 q10,q10,q11
- eor r14,r14,r3,ror#16
- veor q12,q1,q2
- add r8,r8,r10
- veor q13,q5,q6
- mov r7,r7,ror#20
- veor q14,q9,q10
- add r9,r9,r14
- vshr.u32 q1,q12,#25
- mov r4,r4,ror#20
- vshr.u32 q5,q13,#25
- eor r7,r7,r8,ror#20
- vshr.u32 q9,q14,#25
- eor r4,r4,r9,ror#20
- vsli.32 q1,q12,#7
- add r2,r2,r7
- vsli.32 q5,q13,#7
- mov r10,r10,ror#24
- vsli.32 q9,q14,#7
- add r3,r3,r4
- vext.8 q2,q2,q2,#8
- mov r14,r14,ror#24
- vext.8 q6,q6,q6,#8
- eor r10,r10,r2,ror#24
- vext.8 q10,q10,q10,#8
- eor r14,r14,r3,ror#24
- vext.8 q1,q1,q1,#12
- add r8,r8,r10
- vext.8 q5,q5,q5,#12
- mov r7,r7,ror#25
- vext.8 q9,q9,q9,#12
- add r9,r9,r14
- vext.8 q3,q3,q3,#4
- mov r4,r4,ror#25
- vext.8 q7,q7,q7,#4
- eor r7,r7,r8,ror#25
- vext.8 q11,q11,q11,#4
- eor r4,r4,r9,ror#25
- bne Loop_neon
-
- add r11,sp,#32
- vld1.32 {q12,q13},[sp] @ load key material
- vld1.32 {q14,q15},[r11]
-
- ldr r11,[sp,#4*(32+2)] @ load len
-
- str r8, [sp,#4*(16+8)] @ modulo-scheduled store
- str r9, [sp,#4*(16+9)]
- str r12,[sp,#4*(16+12)]
- str r10, [sp,#4*(16+13)]
- str r14,[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ rx and second half at sp+4*(16+8)
-
- ldr r12,[sp,#4*(32+1)] @ load inp
- ldr r14,[sp,#4*(32+0)] @ load out
-
- vadd.i32 q0,q0,q12 @ accumulate key material
- vadd.i32 q4,q4,q12
- vadd.i32 q8,q8,q12
- vldr d24,[sp,#4*(16+0)] @ one
-
- vadd.i32 q1,q1,q13
- vadd.i32 q5,q5,q13
- vadd.i32 q9,q9,q13
- vldr d26,[sp,#4*(16+2)] @ two
-
- vadd.i32 q2,q2,q14
- vadd.i32 q6,q6,q14
- vadd.i32 q10,q10,q14
- vadd.i32 d14,d14,d24 @ counter+1
- vadd.i32 d22,d22,d26 @ counter+2
-
- vadd.i32 q3,q3,q15
- vadd.i32 q7,q7,q15
- vadd.i32 q11,q11,q15
-
- cmp r11,#64*4
- blo Ltail_neon
-
- vld1.8 {q12,q13},[r12]! @ load input
- mov r11,sp
- vld1.8 {q14,q15},[r12]!
- veor q0,q0,q12 @ xor with input
- veor q1,q1,q13
- vld1.8 {q12,q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14,q15},[r12]!
-
- veor q4,q4,q12
- vst1.8 {q0,q1},[r14]! @ store output
- veor q5,q5,q13
- vld1.8 {q12,q13},[r12]!
- veor q6,q6,q14
- vst1.8 {q2,q3},[r14]!
- veor q7,q7,q15
- vld1.8 {q14,q15},[r12]!
-
- veor q8,q8,q12
- vld1.32 {q0,q1},[r11]! @ load for next iteration
- veor d25,d25,d25
- vldr d24,[sp,#4*(16+4)] @ four
- veor q9,q9,q13
- vld1.32 {q2,q3},[r11]
- veor q10,q10,q14
- vst1.8 {q4,q5},[r14]!
- veor q11,q11,q15
- vst1.8 {q6,q7},[r14]!
-
- vadd.i32 d6,d6,d24 @ next counter value
- vldr d24,[sp,#4*(16+0)] @ one
-
- ldmia sp,{r8,r9,r10,r11} @ load key material
- add r0,r0,r8 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- vst1.8 {q8,q9},[r14]!
- add r1,r1,r9
- ldr r9,[r12,#-12]
- vst1.8 {q10,q11},[r14]!
- add r2,r2,r10
- ldr r10,[r12,#-8]
- add r3,r3,r11
- ldr r11,[r12,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- eor r0,r0,r8 @ xor with input
- add r8,sp,#4*(4)
- eor r1,r1,r9
- str r0,[r14],#16 @ store output
- eor r2,r2,r10
- str r1,[r14,#-12]
- eor r3,r3,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r4,r8 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- add r5,r5,r9
- ldr r9,[r12,#-12]
- add r6,r6,r10
- ldr r10,[r12,#-8]
- add r7,r7,r11
- ldr r11,[r12,#-4]
-# ifdef __ARMEB__
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
- eor r4,r4,r8
- add r8,sp,#4*(8)
- eor r5,r5,r9
- str r4,[r14],#16 @ store output
- eor r6,r6,r10
- str r5,[r14,#-12]
- eor r7,r7,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r6,[r14,#-8]
- add r0,sp,#4*(16+8)
- str r7,[r14,#-4]
-
- ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- ldr r8,[r12],#16 @ load input
- add r1,r1,r9
- ldr r9,[r12,#-12]
-# ifdef __thumb2__
- it hi
-# endif
- strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
- add r2,r2,r10
- ldr r10,[r12,#-8]
-# ifdef __thumb2__
- it hi
-# endif
- strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
- add r3,r3,r11
- ldr r11,[r12,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- eor r0,r0,r8
- add r8,sp,#4*(12)
- eor r1,r1,r9
- str r0,[r14],#16 @ store output
- eor r2,r2,r10
- str r1,[r14,#-12]
- eor r3,r3,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
- str r2,[r14,#-8]
- str r3,[r14,#-4]
-
- add r4,r4,r8 @ accumulate key material
- add r8,r8,#4 @ next counter value
- add r5,r5,r9
- str r8,[sp,#4*(12)] @ save next counter value
- ldr r8,[r12],#16 @ load input
- add r6,r6,r10
- add r4,r4,#3 @ counter+3
- ldr r9,[r12,#-12]
- add r7,r7,r11
- ldr r10,[r12,#-8]
- ldr r11,[r12,#-4]
-# ifdef __ARMEB__
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
- eor r4,r4,r8
-# ifdef __thumb2__
- it hi
-# endif
- ldrhi r8,[sp,#4*(32+2)] @ re-load len
- eor r5,r5,r9
- eor r6,r6,r10
- str r4,[r14],#16 @ store output
- eor r7,r7,r11
- str r5,[r14,#-12]
- sub r11,r8,#64*4 @ len-=64*4
- str r6,[r14,#-8]
- str r7,[r14,#-4]
- bhi Loop_neon_outer
-
- b Ldone_neon
-
-.align 4
-Lbreak_neon:
- @ harmonize NEON and integer-only stack frames: load data
- @ from NEON frame, but save to integer-only one; distance
- @ between the two is 4*(32+4+16-32)=4*(20).
-
- str r11, [sp,#4*(20+32+2)] @ save len
- add r11,sp,#4*(32+4)
- str r12, [sp,#4*(20+32+1)] @ save inp
- str r14, [sp,#4*(20+32+0)] @ save out
-
- ldr r12,[sp,#4*(16+10)]
- ldr r14,[sp,#4*(16+11)]
- vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement
- str r12,[sp,#4*(20+16+10)] @ copy "rx"
- str r14,[sp,#4*(20+16+11)] @ copy "rx"
-
- ldr r11, [sp,#4*(15)]
- ldr r12,[sp,#4*(12)] @ modulo-scheduled load
- ldr r10, [sp,#4*(13)]
- ldr r14,[sp,#4*(14)]
- str r11, [sp,#4*(20+16+15)]
- add r11,sp,#4*(20)
- vst1.32 {q0,q1},[r11]! @ copy key
- add sp,sp,#4*(20) @ switch frame
- vst1.32 {q2,q3},[r11]
- mov r11,#10
- b Loop @ go integer-only
-
-.align 4
-Ltail_neon:
- cmp r11,#64*3
- bhs L192_or_more_neon
- cmp r11,#64*2
- bhs L128_or_more_neon
- cmp r11,#64*1
- bhs L64_or_more_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q0,q1},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q2,q3},[r8]
- b Loop_tail_neon
-
-.align 4
-L64_or_more_neon:
- vld1.8 {q12,q13},[r12]!
- vld1.8 {q14,q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- veor q2,q2,q14
- veor q3,q3,q15
- vst1.8 {q0,q1},[r14]!
- vst1.8 {q2,q3},[r14]!
-
- beq Ldone_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q4,q5},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q6,q7},[r8]
- sub r11,r11,#64*1 @ len-=64*1
- b Loop_tail_neon
-
-.align 4
-L128_or_more_neon:
- vld1.8 {q12,q13},[r12]!
- vld1.8 {q14,q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- vld1.8 {q12,q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14,q15},[r12]!
-
- veor q4,q4,q12
- veor q5,q5,q13
- vst1.8 {q0,q1},[r14]!
- veor q6,q6,q14
- vst1.8 {q2,q3},[r14]!
- veor q7,q7,q15
- vst1.8 {q4,q5},[r14]!
- vst1.8 {q6,q7},[r14]!
-
- beq Ldone_neon
-
- add r8,sp,#4*(8)
- vst1.8 {q8,q9},[sp]
- add r10,sp,#4*(0)
- vst1.8 {q10,q11},[r8]
- sub r11,r11,#64*2 @ len-=64*2
- b Loop_tail_neon
-
-.align 4
-L192_or_more_neon:
- vld1.8 {q12,q13},[r12]!
- vld1.8 {q14,q15},[r12]!
- veor q0,q0,q12
- veor q1,q1,q13
- vld1.8 {q12,q13},[r12]!
- veor q2,q2,q14
- veor q3,q3,q15
- vld1.8 {q14,q15},[r12]!
-
- veor q4,q4,q12
- veor q5,q5,q13
- vld1.8 {q12,q13},[r12]!
- veor q6,q6,q14
- vst1.8 {q0,q1},[r14]!
- veor q7,q7,q15
- vld1.8 {q14,q15},[r12]!
-
- veor q8,q8,q12
- vst1.8 {q2,q3},[r14]!
- veor q9,q9,q13
- vst1.8 {q4,q5},[r14]!
- veor q10,q10,q14
- vst1.8 {q6,q7},[r14]!
- veor q11,q11,q15
- vst1.8 {q8,q9},[r14]!
- vst1.8 {q10,q11},[r14]!
-
- beq Ldone_neon
-
- ldmia sp,{r8,r9,r10,r11} @ load key material
- add r0,r0,r8 @ accumulate key material
- add r8,sp,#4*(4)
- add r1,r1,r9
- add r2,r2,r10
- add r3,r3,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
-
- add r4,r4,r8 @ accumulate key material
- add r8,sp,#4*(8)
- add r5,r5,r9
- add r6,r6,r10
- add r7,r7,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
- stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7}
- add r0,sp,#4*(16+8)
-
- ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
-
- add r0,r0,r8 @ accumulate key material
- add r8,sp,#4*(12)
- add r1,r1,r9
- add r2,r2,r10
- add r3,r3,r11
- ldmia r8,{r8,r9,r10,r11} @ load key material
-
- add r4,r4,r8 @ accumulate key material
- add r8,sp,#4*(8)
- add r5,r5,r9
- add r4,r4,#3 @ counter+3
- add r6,r6,r10
- add r7,r7,r11
- ldr r11,[sp,#4*(32+2)] @ re-load len
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
- rev r4,r4
- rev r5,r5
- rev r6,r6
- rev r7,r7
-# endif
- stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7}
- add r10,sp,#4*(0)
- sub r11,r11,#64*3 @ len-=64*3
-
-Loop_tail_neon:
- ldrb r8,[r10],#1 @ read buffer on stack
- ldrb r9,[r12],#1 @ read input
- subs r11,r11,#1
- eor r8,r8,r9
- strb r8,[r14],#1 @ store output
- bne Loop_tail_neon
-
-Ldone_neon:
- add sp,sp,#4*(32+4)
- vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15}
- add sp,sp,#4*(16+3)
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-
-.comm _OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol _OPENSSL_armcap_P
-.long 0
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
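
The deleted chacha-armv4-apple.S provides the 32-bit ARM ChaCha20 core, selecting the NEON path at run time via OPENSSL_armcap_P for inputs longer than 192 bytes. Its C-visible entry point follows BoringSSL's usual ChaCha20_ctr32 shape; the sketch below shows how a wrapper would drive it, but the exact internal prototype should be checked against crypto/chacha/internal.h and is an assumption here.

#include <stddef.h>
#include <stdint.h>

/* Assumed internal prototype for the assembly entry point deleted above. */
void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
                    const uint32_t key[8], const uint32_t counter[4]);

/* Illustrative use: counter[0] is the 32-bit block counter and counter[1..3]
 * carry the 96-bit nonce, matching the "counter and nonce" block the assembly
 * pulls from the stack before entering its main loop. */
static void chacha20_xor(uint8_t *out, const uint8_t *in, size_t len,
                         const uint32_t key[8], const uint32_t nonce[3],
                         uint32_t block_counter) {
  uint32_t counter[4] = {block_counter, nonce[0], nonce[1], nonce[2]};
  ChaCha20_ctr32(out, in, len, key, counter);
}
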
diff --git a/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S b/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S
deleted file mode 100644
index 638062a..0000000
--- a/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S
+++ /dev/null
@@ -1,801 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-
-.code 32
-#undef __thumb2__
-.align 5
-Lrcon:
-.long 0x01,0x01,0x01,0x01
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
-.long 0x1b,0x1b,0x1b,0x1b
-
-.text
-
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-#ifdef __thumb2__
-.thumb_func _aes_hw_set_encrypt_key
-#endif
-.align 5
-_aes_hw_set_encrypt_key:
-Lenc_key:
- mov r3,#-1
- cmp r0,#0
- beq Lenc_key_abort
- cmp r2,#0
- beq Lenc_key_abort
- mov r3,#-2
- cmp r1,#128
- blt Lenc_key_abort
- cmp r1,#256
- bgt Lenc_key_abort
- tst r1,#0x3f
- bne Lenc_key_abort
-
- adr r3,Lrcon
- cmp r1,#192
-
- veor q0,q0,q0
- vld1.8 {q3},[r0]!
- mov r1,#8 @ reuse r1
- vld1.32 {q1,q2},[r3]!
-
- blt Loop128
- beq L192
- b L256
-
-.align 4
-Loop128:
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
- bne Loop128
-
- vld1.32 {q1},[r3]
-
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
-
- vtbl.8 d20,{q3},d4
- vtbl.8 d21,{q3},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q3},[r2]!
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- veor q3,q3,q10
- vst1.32 {q3},[r2]
- add r2,r2,#0x50
-
- mov r12,#10
- b Ldone
-
-.align 4
-L192:
- vld1.8 {d16},[r0]!
- vmov.i8 q10,#8 @ borrow q10
- vst1.32 {q3},[r2]!
- vsub.i8 q2,q2,q10 @ adjust the mask
-
-Loop192:
- vtbl.8 d20,{q8},d4
- vtbl.8 d21,{q8},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {d16},[r2]!
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
-
- vdup.32 q9,d7[1]
- veor q9,q9,q8
- veor q10,q10,q1
- vext.8 q8,q0,q8,#12
- vshl.u8 q1,q1,#1
- veor q8,q8,q9
- veor q3,q3,q10
- veor q8,q8,q10
- vst1.32 {q3},[r2]!
- bne Loop192
-
- mov r12,#12
- add r2,r2,#0x20
- b Ldone
-
-.align 4
-L256:
- vld1.8 {q8},[r0]
- mov r1,#7
- mov r12,#14
- vst1.32 {q3},[r2]!
-
-Loop256:
- vtbl.8 d20,{q8},d4
- vtbl.8 d21,{q8},d5
- vext.8 q9,q0,q3,#12
- vst1.32 {q8},[r2]!
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
- subs r1,r1,#1
-
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q3,q3,q9
- vext.8 q9,q0,q9,#12
- veor q10,q10,q1
- veor q3,q3,q9
- vshl.u8 q1,q1,#1
- veor q3,q3,q10
- vst1.32 {q3},[r2]!
- beq Ldone
-
- vdup.32 q10,d7[1]
- vext.8 q9,q0,q8,#12
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
-
- veor q8,q8,q9
- vext.8 q9,q0,q9,#12
- veor q8,q8,q9
- vext.8 q9,q0,q9,#12
- veor q8,q8,q9
-
- veor q8,q8,q10
- b Loop256
-
-Ldone:
- str r12,[r2]
- mov r3,#0
-
-Lenc_key_abort:
- mov r0,r3 @ return value
-
- bx lr
-
-
-.globl _aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
-#ifdef __thumb2__
-.thumb_func _aes_hw_set_decrypt_key
-#endif
-.align 5
-_aes_hw_set_decrypt_key:
- stmdb sp!,{r4,lr}
- bl Lenc_key
-
- cmp r0,#0
- bne Ldec_key_abort
-
- sub r2,r2,#240 @ restore original r2
- mov r4,#-16
- add r0,r2,r12,lsl#4 @ end of key schedule
-
- vld1.32 {q0},[r2]
- vld1.32 {q1},[r0]
- vst1.32 {q0},[r0],r4
- vst1.32 {q1},[r2]!
-
-Loop_imc:
- vld1.32 {q0},[r2]
- vld1.32 {q1},[r0]
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
- vst1.32 {q0},[r0],r4
- vst1.32 {q1},[r2]!
- cmp r0,r2
- bhi Loop_imc
-
- vld1.32 {q0},[r2]
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
- vst1.32 {q0},[r0]
-
- eor r0,r0,r0 @ return value
-Ldec_key_abort:
- ldmia sp!,{r4,pc}
-
-.globl _aes_hw_encrypt
-.private_extern _aes_hw_encrypt
-#ifdef __thumb2__
-.thumb_func _aes_hw_encrypt
-#endif
-.align 5
-_aes_hw_encrypt:
- AARCH64_VALID_CALL_TARGET
- ldr r3,[r2,#240]
- vld1.32 {q0},[r2]!
- vld1.8 {q2},[r0]
- sub r3,r3,#2
- vld1.32 {q1},[r2]!
-
-Loop_enc:
-.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
-.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- vld1.32 {q0},[r2]!
- subs r3,r3,#2
-.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
-.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- vld1.32 {q1},[r2]!
- bgt Loop_enc
-
-.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
-.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
- vld1.32 {q0},[r2]
-.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
- veor q2,q2,q0
-
- vst1.8 {q2},[r1]
- bx lr
-
-.globl _aes_hw_decrypt
-.private_extern _aes_hw_decrypt
-#ifdef __thumb2__
-.thumb_func _aes_hw_decrypt
-#endif
-.align 5
-_aes_hw_decrypt:
- AARCH64_VALID_CALL_TARGET
- ldr r3,[r2,#240]
- vld1.32 {q0},[r2]!
- vld1.8 {q2},[r0]
- sub r3,r3,#2
- vld1.32 {q1},[r2]!
-
-Loop_dec:
-.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
-.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- vld1.32 {q0},[r2]!
- subs r3,r3,#2
-.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
-.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- vld1.32 {q1},[r2]!
- bgt Loop_dec
-
-.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
-.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
- vld1.32 {q0},[r2]
-.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
- veor q2,q2,q0
-
- vst1.8 {q2},[r1]
- bx lr
-
-.globl _aes_hw_cbc_encrypt
-.private_extern _aes_hw_cbc_encrypt
-#ifdef __thumb2__
-.thumb_func _aes_hw_cbc_encrypt
-#endif
-.align 5
-_aes_hw_cbc_encrypt:
- mov ip,sp
- stmdb sp!,{r4,r5,r6,r7,r8,lr}
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
- ldmia ip,{r4,r5} @ load remaining args
- subs r2,r2,#16
- mov r8,#16
- blo Lcbc_abort
- moveq r8,#0
-
- cmp r5,#0 @ en- or decrypting?
- ldr r5,[r3,#240]
- and r2,r2,#-16
- vld1.8 {q6},[r4]
- vld1.8 {q0},[r0],r8
-
- vld1.32 {q8,q9},[r3] @ load key schedule...
- sub r5,r5,#6
- add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
- sub r5,r5,#2
- vld1.32 {q10,q11},[r7]!
- vld1.32 {q12,q13},[r7]!
- vld1.32 {q14,q15},[r7]!
- vld1.32 {q7},[r7]
-
- add r7,r3,#32
- mov r6,r5
- beq Lcbc_dec
-
- cmp r5,#2
- veor q0,q0,q6
- veor q5,q8,q7
- beq Lcbc_enc128
-
- vld1.32 {q2,q3},[r7]
- add r7,r3,#16
- add r6,r3,#16*4
- add r12,r3,#16*5
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- add r14,r3,#16*6
- add r3,r3,#16*7
- b Lenter_cbc_enc
-
-.align 4
-Loop_cbc_enc:
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vst1.8 {q6},[r1]!
-Lenter_cbc_enc:
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q8},[r6]
- cmp r5,#4
-.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r12]
- beq Lcbc_enc192
-
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q8},[r14]
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r3]
- nop
-
-Lcbc_enc192:
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r2,r2,#16
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- moveq r8,#0
-.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q8},[r0],r8
-.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- veor q8,q8,q5
-.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
-.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
- veor q6,q0,q7
- bhs Loop_cbc_enc
-
- vst1.8 {q6},[r1]!
- b Lcbc_done
-
-.align 5
-Lcbc_enc128:
- vld1.32 {q2,q3},[r7]
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- b Lenter_cbc_enc128
-Loop_cbc_enc128:
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vst1.8 {q6},[r1]!
-Lenter_cbc_enc128:
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- subs r2,r2,#16
-.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- moveq r8,#0
-.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- vld1.8 {q8},[r0],r8
-.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
- veor q8,q8,q5
-.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
- veor q6,q0,q7
- bhs Loop_cbc_enc128
-
- vst1.8 {q6},[r1]!
- b Lcbc_done
-.align 5
-Lcbc_dec:
- vld1.8 {q10},[r0]!
- subs r2,r2,#32 @ bias
- add r6,r5,#2
- vorr q3,q0,q0
- vorr q1,q0,q0
- vorr q11,q10,q10
- blo Lcbc_dec_tail
-
- vorr q1,q10,q10
- vld1.8 {q10},[r0]!
- vorr q2,q0,q0
- vorr q3,q1,q1
- vorr q11,q10,q10
-
-Loop3x_cbc_dec:
-.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q8},[r7]!
- subs r6,r6,#2
-.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q9},[r7]!
- bgt Loop3x_cbc_dec
-
-.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- veor q4,q6,q7
- subs r2,r2,#0x30
- veor q5,q2,q7
-	movlo	r6,r2			@ r6 is zero at this point
-.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- veor q9,q3,q7
-	add	r0,r0,r6		@ r0 is adjusted in such a way that
- @ at exit from the loop q1-q10
-						@ are loaded with the last "words"
- vorr q6,q11,q11
- mov r7,r3
-.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.8 {q2},[r0]!
-.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.8 {q3},[r0]!
-.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
-.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
-.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.8 {q11},[r0]!
-.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
-.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
-.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
- add r6,r5,#2
- veor q4,q4,q0
- veor q5,q5,q1
- veor q10,q10,q9
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vst1.8 {q4},[r1]!
- vorr q0,q2,q2
- vst1.8 {q5},[r1]!
- vorr q1,q3,q3
- vst1.8 {q10},[r1]!
- vorr q10,q11,q11
- bhs Loop3x_cbc_dec
-
- cmn r2,#0x30
- beq Lcbc_done
- nop
-
-Lcbc_dec_tail:
-.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q8},[r7]!
- subs r6,r6,#2
-.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- vld1.32 {q9},[r7]!
- bgt Lcbc_dec_tail
-
-.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
-.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
-.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- cmn r2,#0x20
-.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- veor q5,q6,q7
-.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
-.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
-.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
-.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
- veor q9,q3,q7
-.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
-.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
- beq Lcbc_dec_one
- veor q5,q5,q1
- veor q9,q9,q10
- vorr q6,q11,q11
- vst1.8 {q5},[r1]!
- vst1.8 {q9},[r1]!
- b Lcbc_done
-
-Lcbc_dec_one:
- veor q5,q5,q10
- vorr q6,q11,q11
- vst1.8 {q5},[r1]!
-
-Lcbc_done:
- vst1.8 {q6},[r4]
-Lcbc_abort:
- vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!,{r4,r5,r6,r7,r8,pc}
-
-.globl _aes_hw_ctr32_encrypt_blocks
-.private_extern _aes_hw_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func _aes_hw_ctr32_encrypt_blocks
-#endif
-.align 5
-_aes_hw_ctr32_encrypt_blocks:
- mov ip,sp
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
- ldr r4, [ip] @ load remaining arg
- ldr r5,[r3,#240]
-
- ldr r8, [r4, #12]
- vld1.32 {q0},[r4]
-
- vld1.32 {q8,q9},[r3] @ load key schedule...
- sub r5,r5,#4
- mov r12,#16
- cmp r2,#2
- add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
- sub r5,r5,#2
- vld1.32 {q12,q13},[r7]!
- vld1.32 {q14,q15},[r7]!
- vld1.32 {q7},[r7]
- add r7,r3,#32
- mov r6,r5
- movlo r12,#0
-
- @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
- @ affected by silicon errata #1742098 [0] and #1655431 [1],
- @ respectively, where the second instruction of an aese/aesmc
- @ instruction pair may execute twice if an interrupt is taken right
- @ after the first instruction consumes an input register of which a
- @ single 32-bit lane has been updated the last time it was modified.
- @
-	@ This function uses a counter in one 32-bit lane. The code
-	@ could write to q1 and q10 directly, but that trips these bugs.
- @ We write to q6 and copy to the final register as a workaround.
- @
- @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
- @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
-#ifndef __ARMEB__
- rev r8, r8
-#endif
- add r10, r8, #1
- vorr q6,q0,q0
- rev r10, r10
- vmov.32 d13[1],r10
- add r8, r8, #2
- vorr q1,q6,q6
- bls Lctr32_tail
- rev r12, r8
- vmov.32 d13[1],r12
- sub r2,r2,#3 @ bias
- vorr q10,q6,q6
- b Loop3x_ctr32
-
-.align 4
-Loop3x_ctr32:
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
-.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
-.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
- vld1.32 {q8},[r7]!
- subs r6,r6,#2
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
-.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
-.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
- vld1.32 {q9},[r7]!
- bgt Loop3x_ctr32
-
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
-.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
-.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
- vld1.8 {q2},[r0]!
- add r9,r8,#1
-.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
-.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
- vld1.8 {q3},[r0]!
- rev r9,r9
-.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
-.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
-.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
-.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- vld1.8 {q11},[r0]!
- mov r7,r3
-.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
-.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
-.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
-.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
-.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
-.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- veor q2,q2,q7
- add r10,r8,#2
-.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
-.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
- veor q3,q3,q7
- add r8,r8,#3
-.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
-.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
-.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
-.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
-	@ Note the logic to update q0, q1, and q10 is written to work
- @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
- @ 32-bit mode. See the comment above.
- veor q11,q11,q7
- vmov.32 d13[1], r9
-.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
-.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
- vorr q0,q6,q6
- rev r10,r10
-.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
-.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
- vmov.32 d13[1], r10
- rev r12,r8
-.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
-.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
- vorr q1,q6,q6
- vmov.32 d13[1], r12
-.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
-.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
- vorr q10,q6,q6
- subs r2,r2,#3
-.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
-.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
-.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
-
- veor q2,q2,q4
- vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
- vst1.8 {q2},[r1]!
- veor q3,q3,q5
- mov r6,r5
- vst1.8 {q3},[r1]!
- veor q11,q11,q9
- vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
- vst1.8 {q11},[r1]!
- bhs Loop3x_ctr32
-
- adds r2,r2,#3
- beq Lctr32_done
- cmp r2,#1
- mov r12,#16
- moveq r12,#0
-
-Lctr32_tail:
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- vld1.32 {q8},[r7]!
- subs r6,r6,#2
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- vld1.32 {q9},[r7]!
- bgt Lctr32_tail
-
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
-.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- vld1.8 {q2},[r0],r12
-.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- vld1.8 {q3},[r0]
-.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- veor q2,q2,q7
-.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
-.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
-.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
-.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
- veor q3,q3,q7
-.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
-.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
-
- cmp r2,#1
- veor q2,q2,q0
- veor q3,q3,q1
- vst1.8 {q2},[r1]!
- beq Lctr32_done
- vst1.8 {q3},[r1]
-
-Lctr32_done:
- vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/armv4-mont-apple.S b/apple-arm/crypto/fipsmodule/armv4-mont-apple.S
deleted file mode 100644
index 54bd13f..0000000
--- a/apple-arm/crypto/fipsmodule/armv4-mont-apple.S
+++ /dev/null
@@ -1,974 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.align 5
-LOPENSSL_armcap:
-.word OPENSSL_armcap_P-Lbn_mul_mont
-#endif
-
-.globl _bn_mul_mont
-.private_extern _bn_mul_mont
-#ifdef __thumb2__
-.thumb_func _bn_mul_mont
-#endif
-
-.align 5
-_bn_mul_mont:
-Lbn_mul_mont:
- ldr ip,[sp,#4] @ load num
- stmdb sp!,{r0,r2} @ sp points at argument block
-#if __ARM_MAX_ARCH__>=7
- tst ip,#7
- bne Lialu
- adr r0,Lbn_mul_mont
- ldr r2,LOPENSSL_armcap
- ldr r0,[r0,r2]
-#ifdef __APPLE__
- ldr r0,[r0]
-#endif
- tst r0,#ARMV7_NEON @ NEON available?
- ldmia sp, {r0,r2}
- beq Lialu
- add sp,sp,#8
- b bn_mul8x_mont_neon
-.align 4
-Lialu:
-#endif
- cmp ip,#2
- mov r0,ip @ load num
-#ifdef __thumb2__
- ittt lt
-#endif
- movlt r0,#0
- addlt sp,sp,#2*4
- blt Labrt
-
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
-
- mov r0,r0,lsl#2 @ rescale r0 for byte count
- sub sp,sp,r0 @ alloca(4*num)
- sub sp,sp,#4 @ +extra dword
- sub r0,r0,#4 @ "num=num-1"
- add r4,r2,r0 @ &bp[num-1]
-
- add r0,sp,r0 @ r0 to point at &tp[num-1]
- ldr r8,[r0,#14*4] @ &n0
- ldr r2,[r2] @ bp[0]
- ldr r5,[r1],#4 @ ap[0],ap++
- ldr r6,[r3],#4 @ np[0],np++
- ldr r8,[r8] @ *n0
- str r4,[r0,#15*4] @ save &bp[num]
-
- umull r10,r11,r5,r2 @ ap[0]*bp[0]
- str r8,[r0,#14*4] @ save n0 value
- mul r8,r10,r8 @ "tp[0]"*n0
- mov r12,#0
- umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
- mov r4,sp
-
-L1st:
- ldr r5,[r1],#4 @ ap[j],ap++
- mov r10,r11
- ldr r6,[r3],#4 @ np[j],np++
- mov r11,#0
- umlal r10,r11,r5,r2 @ ap[j]*bp[0]
- mov r14,#0
- umlal r12,r14,r6,r8 @ np[j]*n0
- adds r12,r12,r10
- str r12,[r4],#4 @ tp[j-1]=,tp++
- adc r12,r14,#0
- cmp r4,r0
- bne L1st
-
- adds r12,r12,r11
- ldr r4,[r0,#13*4] @ restore bp
- mov r14,#0
- ldr r8,[r0,#14*4] @ restore n0
- adc r14,r14,#0
- str r12,[r0] @ tp[num-1]=
- mov r7,sp
- str r14,[r0,#4] @ tp[num]=
-
-Louter:
- sub r7,r0,r7 @ "original" r0-1 value
- sub r1,r1,r7 @ "rewind" ap to &ap[1]
- ldr r2,[r4,#4]! @ *(++bp)
- sub r3,r3,r7 @ "rewind" np to &np[1]
- ldr r5,[r1,#-4] @ ap[0]
- ldr r10,[sp] @ tp[0]
- ldr r6,[r3,#-4] @ np[0]
- ldr r7,[sp,#4] @ tp[1]
-
- mov r11,#0
- umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
- str r4,[r0,#13*4] @ save bp
- mul r8,r10,r8
- mov r12,#0
- umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
- mov r4,sp
-
-Linner:
- ldr r5,[r1],#4 @ ap[j],ap++
- adds r10,r11,r7 @ +=tp[j]
- ldr r6,[r3],#4 @ np[j],np++
- mov r11,#0
- umlal r10,r11,r5,r2 @ ap[j]*bp[i]
- mov r14,#0
- umlal r12,r14,r6,r8 @ np[j]*n0
- adc r11,r11,#0
- ldr r7,[r4,#8] @ tp[j+1]
- adds r12,r12,r10
- str r12,[r4],#4 @ tp[j-1]=,tp++
- adc r12,r14,#0
- cmp r4,r0
- bne Linner
-
- adds r12,r12,r11
- mov r14,#0
- ldr r4,[r0,#13*4] @ restore bp
- adc r14,r14,#0
- ldr r8,[r0,#14*4] @ restore n0
- adds r12,r12,r7
- ldr r7,[r0,#15*4] @ restore &bp[num]
- adc r14,r14,#0
- str r12,[r0] @ tp[num-1]=
- str r14,[r0,#4] @ tp[num]=
-
- cmp r4,r7
-#ifdef __thumb2__
- itt ne
-#endif
- movne r7,sp
- bne Louter
-
- ldr r2,[r0,#12*4] @ pull rp
- mov r5,sp
- add r0,r0,#4 @ r0 to point at &tp[num]
- sub r5,r0,r5 @ "original" num value
- mov r4,sp @ "rewind" r4
- mov r1,r4 @ "borrow" r1
- sub r3,r3,r5 @ "rewind" r3 to &np[0]
-
- subs r7,r7,r7 @ "clear" carry flag
-Lsub: ldr r7,[r4],#4
- ldr r6,[r3],#4
- sbcs r7,r7,r6 @ tp[j]-np[j]
- str r7,[r2],#4 @ rp[j]=
- teq r4,r0 @ preserve carry
- bne Lsub
- sbcs r14,r14,#0 @ upmost carry
- mov r4,sp @ "rewind" r4
- sub r2,r2,r5 @ "rewind" r2
-
-Lcopy: ldr r7,[r4] @ conditional copy
- ldr r5,[r2]
- str sp,[r4],#4 @ zap tp
-#ifdef __thumb2__
- it cc
-#endif
- movcc r5,r7
- str r5,[r2],#4
- teq r4,r0 @ preserve carry
- bne Lcopy
-
- mov sp,r0
- add sp,sp,#4 @ skip over tp[num+1]
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
- add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-Labrt:
-#if __ARM_ARCH__>=5
- bx lr @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
-.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func bn_mul8x_mont_neon
-#endif
-.align 5
-bn_mul8x_mont_neon:
- mov ip,sp
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
- ldmia ip,{r4,r5} @ load rest of parameter block
- mov ip,sp
-
- cmp r5,#8
- bhi LNEON_8n
-
- @ special case for r5==8, everything is in register bank...
-
- vld1.32 {d28[0]}, [r2,:32]!
- veor d8,d8,d8
- sub r7,sp,r5,lsl#4
- vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
- and r7,r7,#-64
- vld1.32 {d30[0]}, [r4,:32]
- mov sp,r7 @ alloca
- vzip.16 d28,d8
-
- vmull.u32 q6,d28,d0[0]
- vmull.u32 q7,d28,d0[1]
- vmull.u32 q8,d28,d1[0]
- vshl.i64 d29,d13,#16
- vmull.u32 q9,d28,d1[1]
-
- vadd.u64 d29,d29,d12
- veor d8,d8,d8
- vmul.u32 d29,d29,d30
-
- vmull.u32 q10,d28,d2[0]
- vld1.32 {d4,d5,d6,d7}, [r3]!
- vmull.u32 q11,d28,d2[1]
- vmull.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmull.u32 q13,d28,d3[1]
-
- vmlal.u32 q6,d29,d4[0]
- sub r9,r5,#1
- vmlal.u32 q7,d29,d4[1]
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
-
- vmlal.u32 q10,d29,d6[0]
- vmov q5,q6
- vmlal.u32 q11,d29,d6[1]
- vmov q6,q7
- vmlal.u32 q12,d29,d7[0]
- vmov q7,q8
- vmlal.u32 q13,d29,d7[1]
- vmov q8,q9
- vmov q9,q10
- vshr.u64 d10,d10,#16
- vmov q10,q11
- vmov q11,q12
- vadd.u64 d10,d10,d11
- vmov q12,q13
- veor q13,q13
- vshr.u64 d10,d10,#16
-
- b LNEON_outer8
-
-.align 4
-LNEON_outer8:
- vld1.32 {d28[0]}, [r2,:32]!
- veor d8,d8,d8
- vzip.16 d28,d8
- vadd.u64 d12,d12,d10
-
- vmlal.u32 q6,d28,d0[0]
- vmlal.u32 q7,d28,d0[1]
- vmlal.u32 q8,d28,d1[0]
- vshl.i64 d29,d13,#16
- vmlal.u32 q9,d28,d1[1]
-
- vadd.u64 d29,d29,d12
- veor d8,d8,d8
- subs r9,r9,#1
- vmul.u32 d29,d29,d30
-
- vmlal.u32 q10,d28,d2[0]
- vmlal.u32 q11,d28,d2[1]
- vmlal.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q13,d28,d3[1]
-
- vmlal.u32 q6,d29,d4[0]
- vmlal.u32 q7,d29,d4[1]
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
-
- vmlal.u32 q10,d29,d6[0]
- vmov q5,q6
- vmlal.u32 q11,d29,d6[1]
- vmov q6,q7
- vmlal.u32 q12,d29,d7[0]
- vmov q7,q8
- vmlal.u32 q13,d29,d7[1]
- vmov q8,q9
- vmov q9,q10
- vshr.u64 d10,d10,#16
- vmov q10,q11
- vmov q11,q12
- vadd.u64 d10,d10,d11
- vmov q12,q13
- veor q13,q13
- vshr.u64 d10,d10,#16
-
- bne LNEON_outer8
-
- vadd.u64 d12,d12,d10
- mov r7,sp
- vshr.u64 d10,d12,#16
- mov r8,r5
- vadd.u64 d13,d13,d10
- add r6,sp,#96
- vshr.u64 d10,d13,#16
- vzip.16 d12,d13
-
- b LNEON_tail_entry
-
-.align 4
-LNEON_8n:
- veor q6,q6,q6
- sub r7,sp,#128
- veor q7,q7,q7
- sub r7,r7,r5,lsl#4
- veor q8,q8,q8
- and r7,r7,#-64
- veor q9,q9,q9
- mov sp,r7 @ alloca
- veor q10,q10,q10
- add r7,r7,#256
- veor q11,q11,q11
- sub r8,r5,#8
- veor q12,q12,q12
- veor q13,q13,q13
-
-LNEON_8n_init:
- vst1.64 {q6,q7},[r7,:256]!
- subs r8,r8,#8
- vst1.64 {q8,q9},[r7,:256]!
- vst1.64 {q10,q11},[r7,:256]!
- vst1.64 {q12,q13},[r7,:256]!
- bne LNEON_8n_init
-
- add r6,sp,#256
- vld1.32 {d0,d1,d2,d3},[r1]!
- add r10,sp,#8
- vld1.32 {d30[0]},[r4,:32]
- mov r9,r5
- b LNEON_8n_outer
-
-.align 4
-LNEON_8n_outer:
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- veor d8,d8,d8
- vzip.16 d28,d8
- add r7,sp,#128
- vld1.32 {d4,d5,d6,d7},[r3]!
-
- vmlal.u32 q6,d28,d0[0]
- vmlal.u32 q7,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q8,d28,d1[0]
- vshl.i64 d29,d13,#16
- vmlal.u32 q9,d28,d1[1]
- vadd.u64 d29,d29,d12
- vmlal.u32 q10,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q11,d28,d2[1]
- vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0]
- vmlal.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q13,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q6,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q7,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q8,d29,d5[0]
- vshr.u64 d12,d12,#16
- vmlal.u32 q9,d29,d5[1]
- vmlal.u32 q10,d29,d6[0]
- vadd.u64 d12,d12,d13
- vmlal.u32 q11,d29,d6[1]
- vshr.u64 d12,d12,#16
- vmlal.u32 q12,d29,d7[0]
- vmlal.u32 q13,d29,d7[1]
- vadd.u64 d14,d14,d12
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0]
- vmlal.u32 q7,d28,d0[0]
- vld1.64 {q6},[r6,:128]!
- vmlal.u32 q8,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q9,d28,d1[0]
- vshl.i64 d29,d15,#16
- vmlal.u32 q10,d28,d1[1]
- vadd.u64 d29,d29,d14
- vmlal.u32 q11,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q12,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1]
- vmlal.u32 q13,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q6,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q7,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q8,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q9,d29,d5[0]
- vshr.u64 d14,d14,#16
- vmlal.u32 q10,d29,d5[1]
- vmlal.u32 q11,d29,d6[0]
- vadd.u64 d14,d14,d15
- vmlal.u32 q12,d29,d6[1]
- vshr.u64 d14,d14,#16
- vmlal.u32 q13,d29,d7[0]
- vmlal.u32 q6,d29,d7[1]
- vadd.u64 d16,d16,d14
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1]
- vmlal.u32 q8,d28,d0[0]
- vld1.64 {q7},[r6,:128]!
- vmlal.u32 q9,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q10,d28,d1[0]
- vshl.i64 d29,d17,#16
- vmlal.u32 q11,d28,d1[1]
- vadd.u64 d29,d29,d16
- vmlal.u32 q12,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q13,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2]
- vmlal.u32 q6,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q7,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q8,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q9,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q10,d29,d5[0]
- vshr.u64 d16,d16,#16
- vmlal.u32 q11,d29,d5[1]
- vmlal.u32 q12,d29,d6[0]
- vadd.u64 d16,d16,d17
- vmlal.u32 q13,d29,d6[1]
- vshr.u64 d16,d16,#16
- vmlal.u32 q6,d29,d7[0]
- vmlal.u32 q7,d29,d7[1]
- vadd.u64 d18,d18,d16
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2]
- vmlal.u32 q9,d28,d0[0]
- vld1.64 {q8},[r6,:128]!
- vmlal.u32 q10,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q11,d28,d1[0]
- vshl.i64 d29,d19,#16
- vmlal.u32 q12,d28,d1[1]
- vadd.u64 d29,d29,d18
- vmlal.u32 q13,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q6,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3]
- vmlal.u32 q7,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q8,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q9,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q10,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q11,d29,d5[0]
- vshr.u64 d18,d18,#16
- vmlal.u32 q12,d29,d5[1]
- vmlal.u32 q13,d29,d6[0]
- vadd.u64 d18,d18,d19
- vmlal.u32 q6,d29,d6[1]
- vshr.u64 d18,d18,#16
- vmlal.u32 q7,d29,d7[0]
- vmlal.u32 q8,d29,d7[1]
- vadd.u64 d20,d20,d18
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3]
- vmlal.u32 q10,d28,d0[0]
- vld1.64 {q9},[r6,:128]!
- vmlal.u32 q11,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q12,d28,d1[0]
- vshl.i64 d29,d21,#16
- vmlal.u32 q13,d28,d1[1]
- vadd.u64 d29,d29,d20
- vmlal.u32 q6,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q7,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4]
- vmlal.u32 q8,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q9,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q10,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q11,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q12,d29,d5[0]
- vshr.u64 d20,d20,#16
- vmlal.u32 q13,d29,d5[1]
- vmlal.u32 q6,d29,d6[0]
- vadd.u64 d20,d20,d21
- vmlal.u32 q7,d29,d6[1]
- vshr.u64 d20,d20,#16
- vmlal.u32 q8,d29,d7[0]
- vmlal.u32 q9,d29,d7[1]
- vadd.u64 d22,d22,d20
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4]
- vmlal.u32 q11,d28,d0[0]
- vld1.64 {q10},[r6,:128]!
- vmlal.u32 q12,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q13,d28,d1[0]
- vshl.i64 d29,d23,#16
- vmlal.u32 q6,d28,d1[1]
- vadd.u64 d29,d29,d22
- vmlal.u32 q7,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q8,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5]
- vmlal.u32 q9,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q10,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q11,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q12,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q13,d29,d5[0]
- vshr.u64 d22,d22,#16
- vmlal.u32 q6,d29,d5[1]
- vmlal.u32 q7,d29,d6[0]
- vadd.u64 d22,d22,d23
- vmlal.u32 q8,d29,d6[1]
- vshr.u64 d22,d22,#16
- vmlal.u32 q9,d29,d7[0]
- vmlal.u32 q10,d29,d7[1]
- vadd.u64 d24,d24,d22
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5]
- vmlal.u32 q12,d28,d0[0]
- vld1.64 {q11},[r6,:128]!
- vmlal.u32 q13,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q6,d28,d1[0]
- vshl.i64 d29,d25,#16
- vmlal.u32 q7,d28,d1[1]
- vadd.u64 d29,d29,d24
- vmlal.u32 q8,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q9,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6]
- vmlal.u32 q10,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q11,d28,d3[1]
- vld1.32 {d28[0]},[r2,:32]! @ *b++
- vmlal.u32 q12,d29,d4[0]
- veor d10,d10,d10
- vmlal.u32 q13,d29,d4[1]
- vzip.16 d28,d10
- vmlal.u32 q6,d29,d5[0]
- vshr.u64 d24,d24,#16
- vmlal.u32 q7,d29,d5[1]
- vmlal.u32 q8,d29,d6[0]
- vadd.u64 d24,d24,d25
- vmlal.u32 q9,d29,d6[1]
- vshr.u64 d24,d24,#16
- vmlal.u32 q10,d29,d7[0]
- vmlal.u32 q11,d29,d7[1]
- vadd.u64 d26,d26,d24
- vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6]
- vmlal.u32 q13,d28,d0[0]
- vld1.64 {q12},[r6,:128]!
- vmlal.u32 q6,d28,d0[1]
- veor d8,d8,d8
- vmlal.u32 q7,d28,d1[0]
- vshl.i64 d29,d27,#16
- vmlal.u32 q8,d28,d1[1]
- vadd.u64 d29,d29,d26
- vmlal.u32 q9,d28,d2[0]
- vmul.u32 d29,d29,d30
- vmlal.u32 q10,d28,d2[1]
- vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7]
- vmlal.u32 q11,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q12,d28,d3[1]
- vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
- vmlal.u32 q13,d29,d4[0]
- vld1.32 {d0,d1,d2,d3},[r1]!
- vmlal.u32 q6,d29,d4[1]
- vmlal.u32 q7,d29,d5[0]
- vshr.u64 d26,d26,#16
- vmlal.u32 q8,d29,d5[1]
- vmlal.u32 q9,d29,d6[0]
- vadd.u64 d26,d26,d27
- vmlal.u32 q10,d29,d6[1]
- vshr.u64 d26,d26,#16
- vmlal.u32 q11,d29,d7[0]
- vmlal.u32 q12,d29,d7[1]
- vadd.u64 d12,d12,d26
- vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7]
- add r10,sp,#8 @ rewind
- sub r8,r5,#8
- b LNEON_8n_inner
-
-.align 4
-LNEON_8n_inner:
- subs r8,r8,#8
- vmlal.u32 q6,d28,d0[0]
- vld1.64 {q13},[r6,:128]
- vmlal.u32 q7,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0]
- vmlal.u32 q8,d28,d1[0]
- vld1.32 {d4,d5,d6,d7},[r3]!
- vmlal.u32 q9,d28,d1[1]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q10,d28,d2[0]
- vmlal.u32 q11,d28,d2[1]
- vmlal.u32 q12,d28,d3[0]
- vmlal.u32 q13,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1]
- vmlal.u32 q6,d29,d4[0]
- vmlal.u32 q7,d29,d4[1]
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
- vmlal.u32 q10,d29,d6[0]
- vmlal.u32 q11,d29,d6[1]
- vmlal.u32 q12,d29,d7[0]
- vmlal.u32 q13,d29,d7[1]
- vst1.64 {q6},[r7,:128]!
- vmlal.u32 q7,d28,d0[0]
- vld1.64 {q6},[r6,:128]
- vmlal.u32 q8,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1]
- vmlal.u32 q9,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q10,d28,d1[1]
- vmlal.u32 q11,d28,d2[0]
- vmlal.u32 q12,d28,d2[1]
- vmlal.u32 q13,d28,d3[0]
- vmlal.u32 q6,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2]
- vmlal.u32 q7,d29,d4[0]
- vmlal.u32 q8,d29,d4[1]
- vmlal.u32 q9,d29,d5[0]
- vmlal.u32 q10,d29,d5[1]
- vmlal.u32 q11,d29,d6[0]
- vmlal.u32 q12,d29,d6[1]
- vmlal.u32 q13,d29,d7[0]
- vmlal.u32 q6,d29,d7[1]
- vst1.64 {q7},[r7,:128]!
- vmlal.u32 q8,d28,d0[0]
- vld1.64 {q7},[r6,:128]
- vmlal.u32 q9,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2]
- vmlal.u32 q10,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q11,d28,d1[1]
- vmlal.u32 q12,d28,d2[0]
- vmlal.u32 q13,d28,d2[1]
- vmlal.u32 q6,d28,d3[0]
- vmlal.u32 q7,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3]
- vmlal.u32 q8,d29,d4[0]
- vmlal.u32 q9,d29,d4[1]
- vmlal.u32 q10,d29,d5[0]
- vmlal.u32 q11,d29,d5[1]
- vmlal.u32 q12,d29,d6[0]
- vmlal.u32 q13,d29,d6[1]
- vmlal.u32 q6,d29,d7[0]
- vmlal.u32 q7,d29,d7[1]
- vst1.64 {q8},[r7,:128]!
- vmlal.u32 q9,d28,d0[0]
- vld1.64 {q8},[r6,:128]
- vmlal.u32 q10,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3]
- vmlal.u32 q11,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q12,d28,d1[1]
- vmlal.u32 q13,d28,d2[0]
- vmlal.u32 q6,d28,d2[1]
- vmlal.u32 q7,d28,d3[0]
- vmlal.u32 q8,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4]
- vmlal.u32 q9,d29,d4[0]
- vmlal.u32 q10,d29,d4[1]
- vmlal.u32 q11,d29,d5[0]
- vmlal.u32 q12,d29,d5[1]
- vmlal.u32 q13,d29,d6[0]
- vmlal.u32 q6,d29,d6[1]
- vmlal.u32 q7,d29,d7[0]
- vmlal.u32 q8,d29,d7[1]
- vst1.64 {q9},[r7,:128]!
- vmlal.u32 q10,d28,d0[0]
- vld1.64 {q9},[r6,:128]
- vmlal.u32 q11,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4]
- vmlal.u32 q12,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q13,d28,d1[1]
- vmlal.u32 q6,d28,d2[0]
- vmlal.u32 q7,d28,d2[1]
- vmlal.u32 q8,d28,d3[0]
- vmlal.u32 q9,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5]
- vmlal.u32 q10,d29,d4[0]
- vmlal.u32 q11,d29,d4[1]
- vmlal.u32 q12,d29,d5[0]
- vmlal.u32 q13,d29,d5[1]
- vmlal.u32 q6,d29,d6[0]
- vmlal.u32 q7,d29,d6[1]
- vmlal.u32 q8,d29,d7[0]
- vmlal.u32 q9,d29,d7[1]
- vst1.64 {q10},[r7,:128]!
- vmlal.u32 q11,d28,d0[0]
- vld1.64 {q10},[r6,:128]
- vmlal.u32 q12,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5]
- vmlal.u32 q13,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q6,d28,d1[1]
- vmlal.u32 q7,d28,d2[0]
- vmlal.u32 q8,d28,d2[1]
- vmlal.u32 q9,d28,d3[0]
- vmlal.u32 q10,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6]
- vmlal.u32 q11,d29,d4[0]
- vmlal.u32 q12,d29,d4[1]
- vmlal.u32 q13,d29,d5[0]
- vmlal.u32 q6,d29,d5[1]
- vmlal.u32 q7,d29,d6[0]
- vmlal.u32 q8,d29,d6[1]
- vmlal.u32 q9,d29,d7[0]
- vmlal.u32 q10,d29,d7[1]
- vst1.64 {q11},[r7,:128]!
- vmlal.u32 q12,d28,d0[0]
- vld1.64 {q11},[r6,:128]
- vmlal.u32 q13,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6]
- vmlal.u32 q6,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q7,d28,d1[1]
- vmlal.u32 q8,d28,d2[0]
- vmlal.u32 q9,d28,d2[1]
- vmlal.u32 q10,d28,d3[0]
- vmlal.u32 q11,d28,d3[1]
- vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7]
- vmlal.u32 q12,d29,d4[0]
- vmlal.u32 q13,d29,d4[1]
- vmlal.u32 q6,d29,d5[0]
- vmlal.u32 q7,d29,d5[1]
- vmlal.u32 q8,d29,d6[0]
- vmlal.u32 q9,d29,d6[1]
- vmlal.u32 q10,d29,d7[0]
- vmlal.u32 q11,d29,d7[1]
- vst1.64 {q12},[r7,:128]!
- vmlal.u32 q13,d28,d0[0]
- vld1.64 {q12},[r6,:128]
- vmlal.u32 q6,d28,d0[1]
- vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7]
- vmlal.u32 q7,d28,d1[0]
- it ne
- addne r6,r6,#16 @ don't advance in last iteration
- vmlal.u32 q8,d28,d1[1]
- vmlal.u32 q9,d28,d2[0]
- vmlal.u32 q10,d28,d2[1]
- vmlal.u32 q11,d28,d3[0]
- vmlal.u32 q12,d28,d3[1]
- it eq
- subeq r1,r1,r5,lsl#2 @ rewind
- vmlal.u32 q13,d29,d4[0]
- vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
- vmlal.u32 q6,d29,d4[1]
- vld1.32 {d0,d1,d2,d3},[r1]!
- vmlal.u32 q7,d29,d5[0]
- add r10,sp,#8 @ rewind
- vmlal.u32 q8,d29,d5[1]
- vmlal.u32 q9,d29,d6[0]
- vmlal.u32 q10,d29,d6[1]
- vmlal.u32 q11,d29,d7[0]
- vst1.64 {q13},[r7,:128]!
- vmlal.u32 q12,d29,d7[1]
-
- bne LNEON_8n_inner
- add r6,sp,#128
- vst1.64 {q6,q7},[r7,:256]!
- veor q2,q2,q2 @ d4-d5
- vst1.64 {q8,q9},[r7,:256]!
- veor q3,q3,q3 @ d6-d7
- vst1.64 {q10,q11},[r7,:256]!
- vst1.64 {q12},[r7,:128]
-
- subs r9,r9,#8
- vld1.64 {q6,q7},[r6,:256]!
- vld1.64 {q8,q9},[r6,:256]!
- vld1.64 {q10,q11},[r6,:256]!
- vld1.64 {q12,q13},[r6,:256]!
-
- itt ne
- subne r3,r3,r5,lsl#2 @ rewind
- bne LNEON_8n_outer
-
- add r7,sp,#128
- vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame
- vshr.u64 d10,d12,#16
- vst1.64 {q2,q3},[sp,:256]!
- vadd.u64 d13,d13,d10
- vst1.64 {q2,q3}, [sp,:256]!
- vshr.u64 d10,d13,#16
- vst1.64 {q2,q3}, [sp,:256]!
- vzip.16 d12,d13
-
- mov r8,r5
- b LNEON_tail_entry
-
-.align 4
-LNEON_tail:
- vadd.u64 d12,d12,d10
- vshr.u64 d10,d12,#16
- vld1.64 {q8,q9}, [r6, :256]!
- vadd.u64 d13,d13,d10
- vld1.64 {q10,q11}, [r6, :256]!
- vshr.u64 d10,d13,#16
- vld1.64 {q12,q13}, [r6, :256]!
- vzip.16 d12,d13
-
-LNEON_tail_entry:
- vadd.u64 d14,d14,d10
- vst1.32 {d12[0]}, [r7, :32]!
- vshr.u64 d10,d14,#16
- vadd.u64 d15,d15,d10
- vshr.u64 d10,d15,#16
- vzip.16 d14,d15
- vadd.u64 d16,d16,d10
- vst1.32 {d14[0]}, [r7, :32]!
- vshr.u64 d10,d16,#16
- vadd.u64 d17,d17,d10
- vshr.u64 d10,d17,#16
- vzip.16 d16,d17
- vadd.u64 d18,d18,d10
- vst1.32 {d16[0]}, [r7, :32]!
- vshr.u64 d10,d18,#16
- vadd.u64 d19,d19,d10
- vshr.u64 d10,d19,#16
- vzip.16 d18,d19
- vadd.u64 d20,d20,d10
- vst1.32 {d18[0]}, [r7, :32]!
- vshr.u64 d10,d20,#16
- vadd.u64 d21,d21,d10
- vshr.u64 d10,d21,#16
- vzip.16 d20,d21
- vadd.u64 d22,d22,d10
- vst1.32 {d20[0]}, [r7, :32]!
- vshr.u64 d10,d22,#16
- vadd.u64 d23,d23,d10
- vshr.u64 d10,d23,#16
- vzip.16 d22,d23
- vadd.u64 d24,d24,d10
- vst1.32 {d22[0]}, [r7, :32]!
- vshr.u64 d10,d24,#16
- vadd.u64 d25,d25,d10
- vshr.u64 d10,d25,#16
- vzip.16 d24,d25
- vadd.u64 d26,d26,d10
- vst1.32 {d24[0]}, [r7, :32]!
- vshr.u64 d10,d26,#16
- vadd.u64 d27,d27,d10
- vshr.u64 d10,d27,#16
- vzip.16 d26,d27
- vld1.64 {q6,q7}, [r6, :256]!
- subs r8,r8,#8
- vst1.32 {d26[0]}, [r7, :32]!
- bne LNEON_tail
-
- vst1.32 {d10[0]}, [r7, :32] @ top-most bit
- sub r3,r3,r5,lsl#2 @ rewind r3
- subs r1,sp,#0 @ clear carry flag
- add r2,sp,r5,lsl#2
-
-LNEON_sub:
- ldmia r1!, {r4,r5,r6,r7}
- ldmia r3!, {r8,r9,r10,r11}
- sbcs r8, r4,r8
- sbcs r9, r5,r9
- sbcs r10,r6,r10
- sbcs r11,r7,r11
- teq r1,r2 @ preserves carry
- stmia r0!, {r8,r9,r10,r11}
- bne LNEON_sub
-
- ldr r10, [r1] @ load top-most bit
- mov r11,sp
- veor q0,q0,q0
- sub r11,r2,r11 @ this is num*4
- veor q1,q1,q1
- mov r1,sp
- sub r0,r0,r11 @ rewind r0
- mov r3,r2 @ second 3/4th of frame
- sbcs r10,r10,#0 @ result is carry flag
-
-LNEON_copy_n_zap:
- ldmia r1!, {r4,r5,r6,r7}
- ldmia r0, {r8,r9,r10,r11}
- it cc
- movcc r8, r4
- vst1.64 {q0,q1}, [r3,:256]! @ wipe
- itt cc
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0,q1}, [r3,:256]! @ wipe
- it cc
- movcc r11,r7
- ldmia r1, {r4,r5,r6,r7}
- stmia r0!, {r8,r9,r10,r11}
- sub r1,r1,#16
- ldmia r0, {r8,r9,r10,r11}
- it cc
- movcc r8, r4
- vst1.64 {q0,q1}, [r1,:256]! @ wipe
- itt cc
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0,q1}, [r3,:256]! @ wipe
- it cc
- movcc r11,r7
- teq r1,r2 @ preserves carry
- stmia r0!, {r8,r9,r10,r11}
- bne LNEON_copy_n_zap
-
- mov sp,ip
- vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
- bx lr @ bx lr
-
-#endif
-.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#if __ARM_MAX_ARCH__>=7
-.comm _OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol _OPENSSL_armcap_P
-.long 0
-.private_extern _OPENSSL_armcap_P
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S b/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S
deleted file mode 100644
index 28cc6b3..0000000
--- a/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S
+++ /dev/null
@@ -1,1528 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License"). You may not use
-@ this file except in compliance with the License. You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-@ of Linaro. Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ Bit-sliced AES for ARM NEON
-@
-@ February 2012.
-@
-@ This implementation is a direct adaptation of the bsaes-x86_64 module
-@ for ARM NEON, except that this module is endian-neutral [in the sense
-@ that it can be compiled for either endianness] courtesy of vld1.8's
-@ neutrality. The initial version doesn't implement an interface to
-@ OpenSSL, only low-level primitives and unsupported entry points, just
-@ enough to collect performance results, which for the Cortex-A8 core are:
-@
-@ encrypt 19.5 cycles per byte processed with 128-bit key
-@ decrypt 22.1 cycles per byte processed with 128-bit key
-@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
-@
-@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
-@ which is [much] worse than anticipated (for further details see
-@ http://www.openssl.org/~appro/Snapdragon-S4.html).
-@
-@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-@ manages in 20.0 cycles].
-@
-@ When comparing to x86_64 results keep in mind that NEON unit is
-@ [mostly] single-issue and thus can't [fully] benefit from
-@ instruction-level parallelism. And when comparing to aes-armv4
-@ results keep in mind key schedule conversion overhead (see
-@ bsaes-x86_64.pl for further details)...
-@
-@ <appro@openssl.org>
-
-@ April-August 2013
-@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-# define VFP_ABI_FRAME 0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME 0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.text
-.syntax unified @ ARMv7-capable assembler is expected to handle this
-#if defined(__thumb2__) && !defined(__APPLE__)
-.thumb
-#else
-.code 32
-# undef __thumb2__
-#endif
-
-#ifdef __thumb2__
-.thumb_func _bsaes_decrypt8
-#endif
-.align 4
-_bsaes_decrypt8:
- adr r6,.
- vldmia r4!, {q9} @ round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
- adr r6,LM0ISR
-#else
- add r6,r6,#LM0ISR-_bsaes_decrypt8
-#endif
-
- vldmia r6!, {q8} @ LM0ISR
- veor q10, q0, q9 @ xor with round0 key
- veor q11, q1, q9
- vtbl.8 d0, {q10}, d16
- vtbl.8 d1, {q10}, d17
- veor q12, q2, q9
- vtbl.8 d2, {q11}, d16
- vtbl.8 d3, {q11}, d17
- veor q13, q3, q9
- vtbl.8 d4, {q12}, d16
- vtbl.8 d5, {q12}, d17
- veor q14, q4, q9
- vtbl.8 d6, {q13}, d16
- vtbl.8 d7, {q13}, d17
- veor q15, q5, q9
- vtbl.8 d8, {q14}, d16
- vtbl.8 d9, {q14}, d17
- veor q10, q6, q9
- vtbl.8 d10, {q15}, d16
- vtbl.8 d11, {q15}, d17
- veor q11, q7, q9
- vtbl.8 d12, {q10}, d16
- vtbl.8 d13, {q10}, d17
- vtbl.8 d14, {q11}, d16
- vtbl.8 d15, {q11}, d17
- vmov.i8 q8,#0x55 @ compose LBS0
- vmov.i8 q9,#0x33 @ compose LBS1
- vshr.u64 q10, q6, #1
- vshr.u64 q11, q4, #1
- veor q10, q10, q7
- veor q11, q11, q5
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #1
- veor q5, q5, q11
- vshl.u64 q11, q11, #1
- veor q6, q6, q10
- veor q4, q4, q11
- vshr.u64 q10, q2, #1
- vshr.u64 q11, q0, #1
- veor q10, q10, q3
- veor q11, q11, q1
- vand q10, q10, q8
- vand q11, q11, q8
- veor q3, q3, q10
- vshl.u64 q10, q10, #1
- veor q1, q1, q11
- vshl.u64 q11, q11, #1
- veor q2, q2, q10
- veor q0, q0, q11
- vmov.i8 q8,#0x0f @ compose LBS2
- vshr.u64 q10, q5, #2
- vshr.u64 q11, q4, #2
- veor q10, q10, q7
- veor q11, q11, q6
- vand q10, q10, q9
- vand q11, q11, q9
- veor q7, q7, q10
- vshl.u64 q10, q10, #2
- veor q6, q6, q11
- vshl.u64 q11, q11, #2
- veor q5, q5, q10
- veor q4, q4, q11
- vshr.u64 q10, q1, #2
- vshr.u64 q11, q0, #2
- veor q10, q10, q3
- veor q11, q11, q2
- vand q10, q10, q9
- vand q11, q11, q9
- veor q3, q3, q10
- vshl.u64 q10, q10, #2
- veor q2, q2, q11
- vshl.u64 q11, q11, #2
- veor q1, q1, q10
- veor q0, q0, q11
- vshr.u64 q10, q3, #4
- vshr.u64 q11, q2, #4
- veor q10, q10, q7
- veor q11, q11, q6
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #4
- veor q6, q6, q11
- vshl.u64 q11, q11, #4
- veor q3, q3, q10
- veor q2, q2, q11
- vshr.u64 q10, q1, #4
- vshr.u64 q11, q0, #4
- veor q10, q10, q5
- veor q11, q11, q4
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #4
- veor q4, q4, q11
- vshl.u64 q11, q11, #4
- veor q1, q1, q10
- veor q0, q0, q11
- sub r5,r5,#1
- b Ldec_sbox
-.align 4
-Ldec_loop:
- vldmia r4!, {q8,q9,q10,q11}
- veor q8, q8, q0
- veor q9, q9, q1
- vtbl.8 d0, {q8}, d24
- vtbl.8 d1, {q8}, d25
- vldmia r4!, {q8}
- veor q10, q10, q2
- vtbl.8 d2, {q9}, d24
- vtbl.8 d3, {q9}, d25
- vldmia r4!, {q9}
- veor q11, q11, q3
- vtbl.8 d4, {q10}, d24
- vtbl.8 d5, {q10}, d25
- vldmia r4!, {q10}
- vtbl.8 d6, {q11}, d24
- vtbl.8 d7, {q11}, d25
- vldmia r4!, {q11}
- veor q8, q8, q4
- veor q9, q9, q5
- vtbl.8 d8, {q8}, d24
- vtbl.8 d9, {q8}, d25
- veor q10, q10, q6
- vtbl.8 d10, {q9}, d24
- vtbl.8 d11, {q9}, d25
- veor q11, q11, q7
- vtbl.8 d12, {q10}, d24
- vtbl.8 d13, {q10}, d25
- vtbl.8 d14, {q11}, d24
- vtbl.8 d15, {q11}, d25
-Ldec_sbox:
- veor q1, q1, q4
- veor q3, q3, q4
-
- veor q4, q4, q7
- veor q1, q1, q6
- veor q2, q2, q7
- veor q6, q6, q4
-
- veor q0, q0, q1
- veor q2, q2, q5
- veor q7, q7, q6
- veor q3, q3, q0
- veor q5, q5, q0
- veor q1, q1, q3
- veor q11, q3, q0
- veor q10, q7, q4
- veor q9, q1, q6
- veor q13, q4, q0
- vmov q8, q10
- veor q12, q5, q2
-
- vorr q10, q10, q9
- veor q15, q11, q8
- vand q14, q11, q12
- vorr q11, q11, q12
- veor q12, q12, q9
- vand q8, q8, q9
- veor q9, q6, q2
- vand q15, q15, q12
- vand q13, q13, q9
- veor q9, q3, q7
- veor q12, q1, q5
- veor q11, q11, q13
- veor q10, q10, q13
- vand q13, q9, q12
- vorr q9, q9, q12
- veor q11, q11, q15
- veor q8, q8, q13
- veor q10, q10, q14
- veor q9, q9, q15
- veor q8, q8, q14
- vand q12, q4, q6
- veor q9, q9, q14
- vand q13, q0, q2
- vand q14, q7, q1
- vorr q15, q3, q5
- veor q11, q11, q12
- veor q9, q9, q14
- veor q8, q8, q15
- veor q10, q10, q13
-
- @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
-
- @ new smaller inversion
-
- vand q14, q11, q9
- vmov q12, q8
-
- veor q13, q10, q14
- veor q15, q8, q14
- veor q14, q8, q14 @ q14=q15
-
- vbsl q13, q9, q8
- vbsl q15, q11, q10
- veor q11, q11, q10
-
- vbsl q12, q13, q14
- vbsl q8, q14, q13
-
- vand q14, q12, q15
- veor q9, q9, q8
-
- veor q14, q14, q11
- veor q12, q5, q2
- veor q8, q1, q6
- veor q10, q15, q14
- vand q10, q10, q5
- veor q5, q5, q1
- vand q11, q1, q15
- vand q5, q5, q14
- veor q1, q11, q10
- veor q5, q5, q11
- veor q15, q15, q13
- veor q14, q14, q9
- veor q11, q15, q14
- veor q10, q13, q9
- vand q11, q11, q12
- vand q10, q10, q2
- veor q12, q12, q8
- veor q2, q2, q6
- vand q8, q8, q15
- vand q6, q6, q13
- vand q12, q12, q14
- vand q2, q2, q9
- veor q8, q8, q12
- veor q2, q2, q6
- veor q12, q12, q11
- veor q6, q6, q10
- veor q5, q5, q12
- veor q2, q2, q12
- veor q1, q1, q8
- veor q6, q6, q8
-
- veor q12, q3, q0
- veor q8, q7, q4
- veor q11, q15, q14
- veor q10, q13, q9
- vand q11, q11, q12
- vand q10, q10, q0
- veor q12, q12, q8
- veor q0, q0, q4
- vand q8, q8, q15
- vand q4, q4, q13
- vand q12, q12, q14
- vand q0, q0, q9
- veor q8, q8, q12
- veor q0, q0, q4
- veor q12, q12, q11
- veor q4, q4, q10
- veor q15, q15, q13
- veor q14, q14, q9
- veor q10, q15, q14
- vand q10, q10, q3
- veor q3, q3, q7
- vand q11, q7, q15
- vand q3, q3, q14
- veor q7, q11, q10
- veor q3, q3, q11
- veor q3, q3, q12
- veor q0, q0, q12
- veor q7, q7, q8
- veor q4, q4, q8
- veor q1, q1, q7
- veor q6, q6, q5
-
- veor q4, q4, q1
- veor q2, q2, q7
- veor q5, q5, q7
- veor q4, q4, q2
- veor q7, q7, q0
- veor q4, q4, q5
- veor q3, q3, q6
- veor q6, q6, q1
- veor q3, q3, q4
-
- veor q4, q4, q0
- veor q7, q7, q3
- subs r5,r5,#1
- bcc Ldec_done
- @ multiplication by 0x05-0x00-0x04-0x00
- vext.8 q8, q0, q0, #8
- vext.8 q14, q3, q3, #8
- vext.8 q15, q5, q5, #8
- veor q8, q8, q0
- vext.8 q9, q1, q1, #8
- veor q14, q14, q3
- vext.8 q10, q6, q6, #8
- veor q15, q15, q5
- vext.8 q11, q4, q4, #8
- veor q9, q9, q1
- vext.8 q12, q2, q2, #8
- veor q10, q10, q6
- vext.8 q13, q7, q7, #8
- veor q11, q11, q4
- veor q12, q12, q2
- veor q13, q13, q7
-
- veor q0, q0, q14
- veor q1, q1, q14
- veor q6, q6, q8
- veor q2, q2, q10
- veor q4, q4, q9
- veor q1, q1, q15
- veor q6, q6, q15
- veor q2, q2, q14
- veor q7, q7, q11
- veor q4, q4, q14
- veor q3, q3, q12
- veor q2, q2, q15
- veor q7, q7, q15
- veor q5, q5, q13
- vext.8 q8, q0, q0, #12 @ x0 <<< 32
- vext.8 q9, q1, q1, #12
- veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
- vext.8 q10, q6, q6, #12
- veor q1, q1, q9
- vext.8 q11, q4, q4, #12
- veor q6, q6, q10
- vext.8 q12, q2, q2, #12
- veor q4, q4, q11
- vext.8 q13, q7, q7, #12
- veor q2, q2, q12
- vext.8 q14, q3, q3, #12
- veor q7, q7, q13
- vext.8 q15, q5, q5, #12
- veor q3, q3, q14
-
- veor q9, q9, q0
- veor q5, q5, q15
- vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
- veor q10, q10, q1
- veor q8, q8, q5
- veor q9, q9, q5
- vext.8 q1, q1, q1, #8
- veor q13, q13, q2
- veor q0, q0, q8
- veor q14, q14, q7
- veor q1, q1, q9
- vext.8 q8, q2, q2, #8
- veor q12, q12, q4
- vext.8 q9, q7, q7, #8
- veor q15, q15, q3
- vext.8 q2, q4, q4, #8
- veor q11, q11, q6
- vext.8 q7, q5, q5, #8
- veor q12, q12, q5
- vext.8 q4, q3, q3, #8
- veor q11, q11, q5
- vext.8 q3, q6, q6, #8
- veor q5, q9, q13
- veor q11, q11, q2
- veor q7, q7, q15
- veor q6, q4, q14
- veor q4, q8, q12
- veor q2, q3, q10
- vmov q3, q11
- @ vmov q5, q9
- vldmia r6, {q12} @ LISR
- ite eq @ Thumb2 thing, sanity check in ARM
- addeq r6,r6,#0x10
- bne Ldec_loop
- vldmia r6, {q12} @ LISRM0
- b Ldec_loop
-.align 4
-Ldec_done:
- vmov.i8 q8,#0x55 @ compose LBS0
- vmov.i8 q9,#0x33 @ compose LBS1
- vshr.u64 q10, q3, #1
- vshr.u64 q11, q2, #1
- veor q10, q10, q5
- veor q11, q11, q7
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #1
- veor q7, q7, q11
- vshl.u64 q11, q11, #1
- veor q3, q3, q10
- veor q2, q2, q11
- vshr.u64 q10, q6, #1
- vshr.u64 q11, q0, #1
- veor q10, q10, q4
- veor q11, q11, q1
- vand q10, q10, q8
- vand q11, q11, q8
- veor q4, q4, q10
- vshl.u64 q10, q10, #1
- veor q1, q1, q11
- vshl.u64 q11, q11, #1
- veor q6, q6, q10
- veor q0, q0, q11
- vmov.i8 q8,#0x0f @ compose LBS2
- vshr.u64 q10, q7, #2
- vshr.u64 q11, q2, #2
- veor q10, q10, q5
- veor q11, q11, q3
- vand q10, q10, q9
- vand q11, q11, q9
- veor q5, q5, q10
- vshl.u64 q10, q10, #2
- veor q3, q3, q11
- vshl.u64 q11, q11, #2
- veor q7, q7, q10
- veor q2, q2, q11
- vshr.u64 q10, q1, #2
- vshr.u64 q11, q0, #2
- veor q10, q10, q4
- veor q11, q11, q6
- vand q10, q10, q9
- vand q11, q11, q9
- veor q4, q4, q10
- vshl.u64 q10, q10, #2
- veor q6, q6, q11
- vshl.u64 q11, q11, #2
- veor q1, q1, q10
- veor q0, q0, q11
- vshr.u64 q10, q4, #4
- vshr.u64 q11, q6, #4
- veor q10, q10, q5
- veor q11, q11, q3
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #4
- veor q3, q3, q11
- vshl.u64 q11, q11, #4
- veor q4, q4, q10
- veor q6, q6, q11
- vshr.u64 q10, q1, #4
- vshr.u64 q11, q0, #4
- veor q10, q10, q7
- veor q11, q11, q2
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #4
- veor q2, q2, q11
- vshl.u64 q11, q11, #4
- veor q1, q1, q10
- veor q0, q0, q11
- vldmia r4, {q8} @ last round key
- veor q6, q6, q8
- veor q4, q4, q8
- veor q2, q2, q8
- veor q7, q7, q8
- veor q3, q3, q8
- veor q5, q5, q8
- veor q0, q0, q8
- veor q1, q1, q8
- bx lr
-
-
-
-.align 6
-_bsaes_const:
-LM0ISR:@ InvShiftRows constants
-.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-LISR:
-.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-LISRM0:
-.quad 0x01040b0e0205080f, 0x0306090c00070a0d
-LM0SR:@ ShiftRows constants
-.quad 0x0a0e02060f03070b, 0x0004080c05090d01
-LSR:
-.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-LSRM0:
-.quad 0x0304090e00050a0f, 0x01060b0c0207080d
-LM0:
-.quad 0x02060a0e03070b0f, 0x0004080c0105090d
-LREVM0SR:
-.quad 0x090d01050c000408, 0x03070b0f060a0e02
-.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 6
-
-
-#ifdef __thumb2__
-.thumb_func _bsaes_encrypt8
-#endif
-.align 4
-_bsaes_encrypt8:
- adr r6,.
- vldmia r4!, {q9} @ round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
- adr r6,LM0SR
-#else
- sub r6,r6,#_bsaes_encrypt8-LM0SR
-#endif
-
- vldmia r6!, {q8} @ LM0SR
-_bsaes_encrypt8_alt:
- veor q10, q0, q9 @ xor with round0 key
- veor q11, q1, q9
- vtbl.8 d0, {q10}, d16
- vtbl.8 d1, {q10}, d17
- veor q12, q2, q9
- vtbl.8 d2, {q11}, d16
- vtbl.8 d3, {q11}, d17
- veor q13, q3, q9
- vtbl.8 d4, {q12}, d16
- vtbl.8 d5, {q12}, d17
- veor q14, q4, q9
- vtbl.8 d6, {q13}, d16
- vtbl.8 d7, {q13}, d17
- veor q15, q5, q9
- vtbl.8 d8, {q14}, d16
- vtbl.8 d9, {q14}, d17
- veor q10, q6, q9
- vtbl.8 d10, {q15}, d16
- vtbl.8 d11, {q15}, d17
- veor q11, q7, q9
- vtbl.8 d12, {q10}, d16
- vtbl.8 d13, {q10}, d17
- vtbl.8 d14, {q11}, d16
- vtbl.8 d15, {q11}, d17
-_bsaes_encrypt8_bitslice:
- vmov.i8 q8,#0x55 @ compose LBS0
- vmov.i8 q9,#0x33 @ compose LBS1
- vshr.u64 q10, q6, #1
- vshr.u64 q11, q4, #1
- veor q10, q10, q7
- veor q11, q11, q5
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #1
- veor q5, q5, q11
- vshl.u64 q11, q11, #1
- veor q6, q6, q10
- veor q4, q4, q11
- vshr.u64 q10, q2, #1
- vshr.u64 q11, q0, #1
- veor q10, q10, q3
- veor q11, q11, q1
- vand q10, q10, q8
- vand q11, q11, q8
- veor q3, q3, q10
- vshl.u64 q10, q10, #1
- veor q1, q1, q11
- vshl.u64 q11, q11, #1
- veor q2, q2, q10
- veor q0, q0, q11
- vmov.i8 q8,#0x0f @ compose LBS2
- vshr.u64 q10, q5, #2
- vshr.u64 q11, q4, #2
- veor q10, q10, q7
- veor q11, q11, q6
- vand q10, q10, q9
- vand q11, q11, q9
- veor q7, q7, q10
- vshl.u64 q10, q10, #2
- veor q6, q6, q11
- vshl.u64 q11, q11, #2
- veor q5, q5, q10
- veor q4, q4, q11
- vshr.u64 q10, q1, #2
- vshr.u64 q11, q0, #2
- veor q10, q10, q3
- veor q11, q11, q2
- vand q10, q10, q9
- vand q11, q11, q9
- veor q3, q3, q10
- vshl.u64 q10, q10, #2
- veor q2, q2, q11
- vshl.u64 q11, q11, #2
- veor q1, q1, q10
- veor q0, q0, q11
- vshr.u64 q10, q3, #4
- vshr.u64 q11, q2, #4
- veor q10, q10, q7
- veor q11, q11, q6
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #4
- veor q6, q6, q11
- vshl.u64 q11, q11, #4
- veor q3, q3, q10
- veor q2, q2, q11
- vshr.u64 q10, q1, #4
- vshr.u64 q11, q0, #4
- veor q10, q10, q5
- veor q11, q11, q4
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #4
- veor q4, q4, q11
- vshl.u64 q11, q11, #4
- veor q1, q1, q10
- veor q0, q0, q11
- sub r5,r5,#1
- b Lenc_sbox
-.align 4
-Lenc_loop:
- vldmia r4!, {q8,q9,q10,q11}
- veor q8, q8, q0
- veor q9, q9, q1
- vtbl.8 d0, {q8}, d24
- vtbl.8 d1, {q8}, d25
- vldmia r4!, {q8}
- veor q10, q10, q2
- vtbl.8 d2, {q9}, d24
- vtbl.8 d3, {q9}, d25
- vldmia r4!, {q9}
- veor q11, q11, q3
- vtbl.8 d4, {q10}, d24
- vtbl.8 d5, {q10}, d25
- vldmia r4!, {q10}
- vtbl.8 d6, {q11}, d24
- vtbl.8 d7, {q11}, d25
- vldmia r4!, {q11}
- veor q8, q8, q4
- veor q9, q9, q5
- vtbl.8 d8, {q8}, d24
- vtbl.8 d9, {q8}, d25
- veor q10, q10, q6
- vtbl.8 d10, {q9}, d24
- vtbl.8 d11, {q9}, d25
- veor q11, q11, q7
- vtbl.8 d12, {q10}, d24
- vtbl.8 d13, {q10}, d25
- vtbl.8 d14, {q11}, d24
- vtbl.8 d15, {q11}, d25
-Lenc_sbox:
- veor q2, q2, q1
- veor q5, q5, q6
- veor q3, q3, q0
- veor q6, q6, q2
- veor q5, q5, q0
-
- veor q6, q6, q3
- veor q3, q3, q7
- veor q7, q7, q5
- veor q3, q3, q4
- veor q4, q4, q5
-
- veor q2, q2, q7
- veor q3, q3, q1
- veor q1, q1, q5
- veor q11, q7, q4
- veor q10, q1, q2
- veor q9, q5, q3
- veor q13, q2, q4
- vmov q8, q10
- veor q12, q6, q0
-
- vorr q10, q10, q9
- veor q15, q11, q8
- vand q14, q11, q12
- vorr q11, q11, q12
- veor q12, q12, q9
- vand q8, q8, q9
- veor q9, q3, q0
- vand q15, q15, q12
- vand q13, q13, q9
- veor q9, q7, q1
- veor q12, q5, q6
- veor q11, q11, q13
- veor q10, q10, q13
- vand q13, q9, q12
- vorr q9, q9, q12
- veor q11, q11, q15
- veor q8, q8, q13
- veor q10, q10, q14
- veor q9, q9, q15
- veor q8, q8, q14
- vand q12, q2, q3
- veor q9, q9, q14
- vand q13, q4, q0
- vand q14, q1, q5
- vorr q15, q7, q6
- veor q11, q11, q12
- veor q9, q9, q14
- veor q8, q8, q15
- veor q10, q10, q13
-
- @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
-
- @ new smaller inversion
-
- vand q14, q11, q9
- vmov q12, q8
-
- veor q13, q10, q14
- veor q15, q8, q14
- veor q14, q8, q14 @ q14=q15
-
- vbsl q13, q9, q8
- vbsl q15, q11, q10
- veor q11, q11, q10
-
- vbsl q12, q13, q14
- vbsl q8, q14, q13
-
- vand q14, q12, q15
- veor q9, q9, q8
-
- veor q14, q14, q11
- veor q12, q6, q0
- veor q8, q5, q3
- veor q10, q15, q14
- vand q10, q10, q6
- veor q6, q6, q5
- vand q11, q5, q15
- vand q6, q6, q14
- veor q5, q11, q10
- veor q6, q6, q11
- veor q15, q15, q13
- veor q14, q14, q9
- veor q11, q15, q14
- veor q10, q13, q9
- vand q11, q11, q12
- vand q10, q10, q0
- veor q12, q12, q8
- veor q0, q0, q3
- vand q8, q8, q15
- vand q3, q3, q13
- vand q12, q12, q14
- vand q0, q0, q9
- veor q8, q8, q12
- veor q0, q0, q3
- veor q12, q12, q11
- veor q3, q3, q10
- veor q6, q6, q12
- veor q0, q0, q12
- veor q5, q5, q8
- veor q3, q3, q8
-
- veor q12, q7, q4
- veor q8, q1, q2
- veor q11, q15, q14
- veor q10, q13, q9
- vand q11, q11, q12
- vand q10, q10, q4
- veor q12, q12, q8
- veor q4, q4, q2
- vand q8, q8, q15
- vand q2, q2, q13
- vand q12, q12, q14
- vand q4, q4, q9
- veor q8, q8, q12
- veor q4, q4, q2
- veor q12, q12, q11
- veor q2, q2, q10
- veor q15, q15, q13
- veor q14, q14, q9
- veor q10, q15, q14
- vand q10, q10, q7
- veor q7, q7, q1
- vand q11, q1, q15
- vand q7, q7, q14
- veor q1, q11, q10
- veor q7, q7, q11
- veor q7, q7, q12
- veor q4, q4, q12
- veor q1, q1, q8
- veor q2, q2, q8
- veor q7, q7, q0
- veor q1, q1, q6
- veor q6, q6, q0
- veor q4, q4, q7
- veor q0, q0, q1
-
- veor q1, q1, q5
- veor q5, q5, q2
- veor q2, q2, q3
- veor q3, q3, q5
- veor q4, q4, q5
-
- veor q6, q6, q3
- subs r5,r5,#1
- bcc Lenc_done
- vext.8 q8, q0, q0, #12 @ x0 <<< 32
- vext.8 q9, q1, q1, #12
- veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
- vext.8 q10, q4, q4, #12
- veor q1, q1, q9
- vext.8 q11, q6, q6, #12
- veor q4, q4, q10
- vext.8 q12, q3, q3, #12
- veor q6, q6, q11
- vext.8 q13, q7, q7, #12
- veor q3, q3, q12
- vext.8 q14, q2, q2, #12
- veor q7, q7, q13
- vext.8 q15, q5, q5, #12
- veor q2, q2, q14
-
- veor q9, q9, q0
- veor q5, q5, q15
- vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
- veor q10, q10, q1
- veor q8, q8, q5
- veor q9, q9, q5
- vext.8 q1, q1, q1, #8
- veor q13, q13, q3
- veor q0, q0, q8
- veor q14, q14, q7
- veor q1, q1, q9
- vext.8 q8, q3, q3, #8
- veor q12, q12, q6
- vext.8 q9, q7, q7, #8
- veor q15, q15, q2
- vext.8 q3, q6, q6, #8
- veor q11, q11, q4
- vext.8 q7, q5, q5, #8
- veor q12, q12, q5
- vext.8 q6, q2, q2, #8
- veor q11, q11, q5
- vext.8 q2, q4, q4, #8
- veor q5, q9, q13
- veor q4, q8, q12
- veor q3, q3, q11
- veor q7, q7, q15
- veor q6, q6, q14
- @ vmov q4, q8
- veor q2, q2, q10
- @ vmov q5, q9
- vldmia r6, {q12} @ LSR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
- addeq r6,r6,#0x10
- bne Lenc_loop
- vldmia r6, {q12} @ LSRM0
- b Lenc_loop
-.align 4
-Lenc_done:
- vmov.i8 q8,#0x55 @ compose LBS0
- vmov.i8 q9,#0x33 @ compose LBS1
- vshr.u64 q10, q2, #1
- vshr.u64 q11, q3, #1
- veor q10, q10, q5
- veor q11, q11, q7
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #1
- veor q7, q7, q11
- vshl.u64 q11, q11, #1
- veor q2, q2, q10
- veor q3, q3, q11
- vshr.u64 q10, q4, #1
- vshr.u64 q11, q0, #1
- veor q10, q10, q6
- veor q11, q11, q1
- vand q10, q10, q8
- vand q11, q11, q8
- veor q6, q6, q10
- vshl.u64 q10, q10, #1
- veor q1, q1, q11
- vshl.u64 q11, q11, #1
- veor q4, q4, q10
- veor q0, q0, q11
- vmov.i8 q8,#0x0f @ compose LBS2
- vshr.u64 q10, q7, #2
- vshr.u64 q11, q3, #2
- veor q10, q10, q5
- veor q11, q11, q2
- vand q10, q10, q9
- vand q11, q11, q9
- veor q5, q5, q10
- vshl.u64 q10, q10, #2
- veor q2, q2, q11
- vshl.u64 q11, q11, #2
- veor q7, q7, q10
- veor q3, q3, q11
- vshr.u64 q10, q1, #2
- vshr.u64 q11, q0, #2
- veor q10, q10, q6
- veor q11, q11, q4
- vand q10, q10, q9
- vand q11, q11, q9
- veor q6, q6, q10
- vshl.u64 q10, q10, #2
- veor q4, q4, q11
- vshl.u64 q11, q11, #2
- veor q1, q1, q10
- veor q0, q0, q11
- vshr.u64 q10, q6, #4
- vshr.u64 q11, q4, #4
- veor q10, q10, q5
- veor q11, q11, q2
- vand q10, q10, q8
- vand q11, q11, q8
- veor q5, q5, q10
- vshl.u64 q10, q10, #4
- veor q2, q2, q11
- vshl.u64 q11, q11, #4
- veor q6, q6, q10
- veor q4, q4, q11
- vshr.u64 q10, q1, #4
- vshr.u64 q11, q0, #4
- veor q10, q10, q7
- veor q11, q11, q3
- vand q10, q10, q8
- vand q11, q11, q8
- veor q7, q7, q10
- vshl.u64 q10, q10, #4
- veor q3, q3, q11
- vshl.u64 q11, q11, #4
- veor q1, q1, q10
- veor q0, q0, q11
- vldmia r4, {q8} @ last round key
- veor q4, q4, q8
- veor q6, q6, q8
- veor q3, q3, q8
- veor q7, q7, q8
- veor q2, q2, q8
- veor q5, q5, q8
- veor q0, q0, q8
- veor q1, q1, q8
- bx lr
-
-#ifdef __thumb2__
-.thumb_func _bsaes_key_convert
-#endif
-.align 4
-_bsaes_key_convert:
- adr r6,.
- vld1.8 {q7}, [r4]! @ load round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
- adr r6,LM0
-#else
- sub r6,r6,#_bsaes_key_convert-LM0
-#endif
- vld1.8 {q15}, [r4]! @ load round 1 key
-
- vmov.i8 q8, #0x01 @ bit masks
- vmov.i8 q9, #0x02
- vmov.i8 q10, #0x04
- vmov.i8 q11, #0x08
- vmov.i8 q12, #0x10
- vmov.i8 q13, #0x20
- vldmia r6, {q14} @ LM0
-
-#ifdef __ARMEL__
- vrev32.8 q7, q7
- vrev32.8 q15, q15
-#endif
- sub r5,r5,#1
- vstmia r12!, {q7} @ save round 0 key
- b Lkey_loop
-
-.align 4
-Lkey_loop:
- vtbl.8 d14,{q15},d28
- vtbl.8 d15,{q15},d29
- vmov.i8 q6, #0x40
- vmov.i8 q15, #0x80
-
- vtst.8 q0, q7, q8
- vtst.8 q1, q7, q9
- vtst.8 q2, q7, q10
- vtst.8 q3, q7, q11
- vtst.8 q4, q7, q12
- vtst.8 q5, q7, q13
- vtst.8 q6, q7, q6
- vtst.8 q7, q7, q15
- vld1.8 {q15}, [r4]! @ load next round key
- vmvn q0, q0 @ "pnot"
- vmvn q1, q1
- vmvn q5, q5
- vmvn q6, q6
-#ifdef __ARMEL__
- vrev32.8 q15, q15
-#endif
- subs r5,r5,#1
- vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
- bne Lkey_loop
-
- vmov.i8 q7,#0x63 @ compose L63
- @ don't save last round key
- bx lr
-
-.globl _bsaes_cbc_encrypt
-.private_extern _bsaes_cbc_encrypt
-#ifdef __thumb2__
-.thumb_func _bsaes_cbc_encrypt
-#endif
-.align 5
-_bsaes_cbc_encrypt:
- @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
- @ short inputs. We patch this out, using bsaes for all input sizes.
-
- @ it is up to the caller to make sure we are called with enc == 0
-
- mov ip, sp
- stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
- VFP_ABI_PUSH
- ldr r8, [ip] @ IV is 1st arg on the stack
- mov r2, r2, lsr#4 @ len in 16 byte blocks
- sub sp, #0x10 @ scratch space to carry over the IV
- mov r9, sp @ save sp
-
- ldr r10, [r3, #240] @ get # of rounds
-#ifndef BSAES_ASM_EXTENDED_KEY
- @ allocate the key schedule on the stack
- sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
- @ populate the key schedule
- mov r4, r3 @ pass key
- mov r5, r10 @ pass # of rounds
- mov sp, r12 @ sp is sp
- bl _bsaes_key_convert
- vldmia sp, {q6}
- vstmia r12, {q15} @ save last round key
- veor q7, q7, q6 @ fix up round 0 key
- vstmia sp, {q7}
-#else
- ldr r12, [r3, #244]
- eors r12, #1
- beq 0f
-
- @ populate the key schedule
- str r12, [r3, #244]
- mov r4, r3 @ pass key
- mov r5, r10 @ pass # of rounds
- add r12, r3, #248 @ pass key schedule
- bl _bsaes_key_convert
- add r4, r3, #248
- vldmia r4, {q6}
- vstmia r12, {q15} @ save last round key
- veor q7, q7, q6 @ fix up round 0 key
- vstmia r4, {q7}
-
-.align 2
-
-#endif
-
- vld1.8 {q15}, [r8] @ load IV
- b Lcbc_dec_loop
-
-.align 4
-Lcbc_dec_loop:
- subs r2, r2, #0x8
- bmi Lcbc_dec_loop_finish
-
- vld1.8 {q0,q1}, [r0]! @ load input
- vld1.8 {q2,q3}, [r0]!
-#ifndef BSAES_ASM_EXTENDED_KEY
- mov r4, sp @ pass the key
-#else
- add r4, r3, #248
-#endif
- vld1.8 {q4,q5}, [r0]!
- mov r5, r10
- vld1.8 {q6,q7}, [r0]
- sub r0, r0, #0x60
- vstmia r9, {q15} @ put aside IV
-
- bl _bsaes_decrypt8
-
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q10,q11}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vld1.8 {q12,q13}, [r0]!
- veor q4, q4, q10
- veor q2, q2, q11
- vld1.8 {q14,q15}, [r0]!
- veor q7, q7, q12
- vst1.8 {q0,q1}, [r1]! @ write output
- veor q3, q3, q13
- vst1.8 {q6}, [r1]!
- veor q5, q5, q14
- vst1.8 {q4}, [r1]!
- vst1.8 {q2}, [r1]!
- vst1.8 {q7}, [r1]!
- vst1.8 {q3}, [r1]!
- vst1.8 {q5}, [r1]!
-
- b Lcbc_dec_loop
-
-Lcbc_dec_loop_finish:
- adds r2, r2, #8
- beq Lcbc_dec_done
-
- @ Set up most parameters for the _bsaes_decrypt8 call.
-#ifndef BSAES_ASM_EXTENDED_KEY
- mov r4, sp @ pass the key
-#else
- add r4, r3, #248
-#endif
- mov r5, r10
- vstmia r9, {q15} @ put aside IV
-
- vld1.8 {q0}, [r0]! @ load input
- cmp r2, #2
- blo Lcbc_dec_one
- vld1.8 {q1}, [r0]!
- beq Lcbc_dec_two
- vld1.8 {q2}, [r0]!
- cmp r2, #4
- blo Lcbc_dec_three
- vld1.8 {q3}, [r0]!
- beq Lcbc_dec_four
- vld1.8 {q4}, [r0]!
- cmp r2, #6
- blo Lcbc_dec_five
- vld1.8 {q5}, [r0]!
- beq Lcbc_dec_six
- vld1.8 {q6}, [r0]!
- sub r0, r0, #0x70
-
- bl _bsaes_decrypt8
-
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q10,q11}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vld1.8 {q12,q13}, [r0]!
- veor q4, q4, q10
- veor q2, q2, q11
- vld1.8 {q15}, [r0]!
- veor q7, q7, q12
- vst1.8 {q0,q1}, [r1]! @ write output
- veor q3, q3, q13
- vst1.8 {q6}, [r1]!
- vst1.8 {q4}, [r1]!
- vst1.8 {q2}, [r1]!
- vst1.8 {q7}, [r1]!
- vst1.8 {q3}, [r1]!
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_six:
- sub r0, r0, #0x60
- bl _bsaes_decrypt8
- vldmia r9,{q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q10,q11}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vld1.8 {q12}, [r0]!
- veor q4, q4, q10
- veor q2, q2, q11
- vld1.8 {q15}, [r0]!
- veor q7, q7, q12
- vst1.8 {q0,q1}, [r1]! @ write output
- vst1.8 {q6}, [r1]!
- vst1.8 {q4}, [r1]!
- vst1.8 {q2}, [r1]!
- vst1.8 {q7}, [r1]!
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_five:
- sub r0, r0, #0x50
- bl _bsaes_decrypt8
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q10,q11}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vld1.8 {q15}, [r0]!
- veor q4, q4, q10
- vst1.8 {q0,q1}, [r1]! @ write output
- veor q2, q2, q11
- vst1.8 {q6}, [r1]!
- vst1.8 {q4}, [r1]!
- vst1.8 {q2}, [r1]!
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_four:
- sub r0, r0, #0x40
- bl _bsaes_decrypt8
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q10}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vld1.8 {q15}, [r0]!
- veor q4, q4, q10
- vst1.8 {q0,q1}, [r1]! @ write output
- vst1.8 {q6}, [r1]!
- vst1.8 {q4}, [r1]!
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_three:
- sub r0, r0, #0x30
- bl _bsaes_decrypt8
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8,q9}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q15}, [r0]!
- veor q1, q1, q8
- veor q6, q6, q9
- vst1.8 {q0,q1}, [r1]! @ write output
- vst1.8 {q6}, [r1]!
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_two:
- sub r0, r0, #0x20
- bl _bsaes_decrypt8
- vldmia r9, {q14} @ reload IV
- vld1.8 {q8}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vld1.8 {q15}, [r0]! @ reload input
- veor q1, q1, q8
- vst1.8 {q0,q1}, [r1]! @ write output
- b Lcbc_dec_done
-.align 4
-Lcbc_dec_one:
- sub r0, r0, #0x10
- bl _bsaes_decrypt8
- vldmia r9, {q14} @ reload IV
- vld1.8 {q15}, [r0]! @ reload input
- veor q0, q0, q14 @ ^= IV
- vst1.8 {q0}, [r1]! @ write output
-
-Lcbc_dec_done:
-#ifndef BSAES_ASM_EXTENDED_KEY
- vmov.i32 q0, #0
- vmov.i32 q1, #0
-Lcbc_dec_bzero:@ wipe key schedule [if any]
- vstmia sp!, {q0,q1}
- cmp sp, r9
- bne Lcbc_dec_bzero
-#endif
-
- mov sp, r9
- add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
- vst1.8 {q15}, [r8] @ return IV
- VFP_ABI_POP
- ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
-
-.globl _bsaes_ctr32_encrypt_blocks
-.private_extern _bsaes_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func _bsaes_ctr32_encrypt_blocks
-#endif
-.align 5
-_bsaes_ctr32_encrypt_blocks:
- @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
- @ out to retain a constant-time implementation.
- mov ip, sp
- stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
- VFP_ABI_PUSH
- ldr r8, [ip] @ ctr is 1st arg on the stack
- sub sp, sp, #0x10 @ scratch space to carry over the ctr
- mov r9, sp @ save sp
-
- ldr r10, [r3, #240] @ get # of rounds
-#ifndef BSAES_ASM_EXTENDED_KEY
- @ allocate the key schedule on the stack
- sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
- add r12, #96 @ size of bit-sliced key schedule
-
- @ populate the key schedule
- mov r4, r3 @ pass key
- mov r5, r10 @ pass # of rounds
- mov sp, r12 @ sp is sp
- bl _bsaes_key_convert
- veor q7,q7,q15 @ fix up last round key
- vstmia r12, {q7} @ save last round key
-
- vld1.8 {q0}, [r8] @ load counter
-#ifdef __APPLE__
- mov r8, #:lower16:(LREVM0SR-LM0)
- add r8, r6, r8
-#else
- add r8, r6, #LREVM0SR-LM0 @ borrow r8
-#endif
- vldmia sp, {q4} @ load round0 key
-#else
- ldr r12, [r3, #244]
- eors r12, #1
- beq 0f
-
- @ populate the key schedule
- str r12, [r3, #244]
- mov r4, r3 @ pass key
- mov r5, r10 @ pass # of rounds
- add r12, r3, #248 @ pass key schedule
- bl _bsaes_key_convert
- veor q7,q7,q15 @ fix up last round key
- vstmia r12, {q7} @ save last round key
-
-.align 2
- add r12, r3, #248
- vld1.8 {q0}, [r8] @ load counter
- adrl r8, LREVM0SR @ borrow r8
- vldmia r12, {q4} @ load round0 key
- sub sp, #0x10 @ place for adjusted round0 key
-#endif
-
- vmov.i32 q8,#1 @ compose 1<<96
- veor q9,q9,q9
- vrev32.8 q0,q0
- vext.8 q8,q9,q8,#4
- vrev32.8 q4,q4
- vadd.u32 q9,q8,q8 @ compose 2<<96
- vstmia sp, {q4} @ save adjusted round0 key
- b Lctr_enc_loop
-
-.align 4
-Lctr_enc_loop:
- vadd.u32 q10, q8, q9 @ compose 3<<96
- vadd.u32 q1, q0, q8 @ +1
- vadd.u32 q2, q0, q9 @ +2
- vadd.u32 q3, q0, q10 @ +3
- vadd.u32 q4, q1, q10
- vadd.u32 q5, q2, q10
- vadd.u32 q6, q3, q10
- vadd.u32 q7, q4, q10
- vadd.u32 q10, q5, q10 @ next counter
-
- @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
- @ to flip byte order in 32-bit counter
-
- vldmia sp, {q9} @ load round0 key
-#ifndef BSAES_ASM_EXTENDED_KEY
- add r4, sp, #0x10 @ pass next round key
-#else
- add r4, r3, #264
-#endif
- vldmia r8, {q8} @ LREVM0SR
- mov r5, r10 @ pass rounds
- vstmia r9, {q10} @ save next counter
-#ifdef __APPLE__
- mov r6, #:lower16:(LREVM0SR-LSR)
- sub r6, r8, r6
-#else
- sub r6, r8, #LREVM0SR-LSR @ pass constants
-#endif
-
- bl _bsaes_encrypt8_alt
-
- subs r2, r2, #8
- blo Lctr_enc_loop_done
-
- vld1.8 {q8,q9}, [r0]! @ load input
- vld1.8 {q10,q11}, [r0]!
- veor q0, q8
- veor q1, q9
- vld1.8 {q12,q13}, [r0]!
- veor q4, q10
- veor q6, q11
- vld1.8 {q14,q15}, [r0]!
- veor q3, q12
- vst1.8 {q0,q1}, [r1]! @ write output
- veor q7, q13
- veor q2, q14
- vst1.8 {q4}, [r1]!
- veor q5, q15
- vst1.8 {q6}, [r1]!
- vmov.i32 q8, #1 @ compose 1<<96
- vst1.8 {q3}, [r1]!
- veor q9, q9, q9
- vst1.8 {q7}, [r1]!
- vext.8 q8, q9, q8, #4
- vst1.8 {q2}, [r1]!
- vadd.u32 q9,q8,q8 @ compose 2<<96
- vst1.8 {q5}, [r1]!
- vldmia r9, {q0} @ load counter
-
- bne Lctr_enc_loop
- b Lctr_enc_done
-
-.align 4
-Lctr_enc_loop_done:
- add r2, r2, #8
- vld1.8 {q8}, [r0]! @ load input
- veor q0, q8
- vst1.8 {q0}, [r1]! @ write output
- cmp r2, #2
- blo Lctr_enc_done
- vld1.8 {q9}, [r0]!
- veor q1, q9
- vst1.8 {q1}, [r1]!
- beq Lctr_enc_done
- vld1.8 {q10}, [r0]!
- veor q4, q10
- vst1.8 {q4}, [r1]!
- cmp r2, #4
- blo Lctr_enc_done
- vld1.8 {q11}, [r0]!
- veor q6, q11
- vst1.8 {q6}, [r1]!
- beq Lctr_enc_done
- vld1.8 {q12}, [r0]!
- veor q3, q12
- vst1.8 {q3}, [r1]!
- cmp r2, #6
- blo Lctr_enc_done
- vld1.8 {q13}, [r0]!
- veor q7, q13
- vst1.8 {q7}, [r1]!
- beq Lctr_enc_done
- vld1.8 {q14}, [r0]
- veor q2, q14
- vst1.8 {q2}, [r1]!
-
-Lctr_enc_done:
- vmov.i32 q0, #0
- vmov.i32 q1, #0
-#ifndef BSAES_ASM_EXTENDED_KEY
-Lctr_enc_bzero:@ wipe key schedule [if any]
- vstmia sp!, {q0,q1}
- cmp sp, r9
- bne Lctr_enc_bzero
-#else
- vstmia sp, {q0,q1}
-#endif
-
- mov sp, r9
- add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
- VFP_ABI_POP
- ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
-
- @ OpenSSL contains aes_nohw_* fallback code here. We patch this
- @ out to retain a constant-time implementation.
-
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S b/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S
deleted file mode 100644
index 0f47586..0000000
--- a/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S
+++ /dev/null
@@ -1,250 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
-@ instructions are in aesv8-armx.pl.)
-
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-#define ldrplb ldrbpl
-#define ldrneb ldrbne
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl _gcm_init_neon
-.private_extern _gcm_init_neon
-#ifdef __thumb2__
-.thumb_func _gcm_init_neon
-#endif
-.align 4
-_gcm_init_neon:
- vld1.64 d7,[r1]! @ load H
- vmov.i8 q8,#0xe1
- vld1.64 d6,[r1]
- vshl.i64 d17,#57
- vshr.u64 d16,#63 @ t0=0xc2....01
- vdup.8 q9,d7[7]
- vshr.u64 d26,d6,#63
- vshr.s8 q9,#7 @ broadcast carry bit
- vshl.i64 q3,q3,#1
- vand q8,q8,q9
- vorr d7,d26 @ H<<<=1
- veor q3,q3,q8 @ twisted H
- vstmia r0,{q3}
-
- bx lr @ bx lr
-
-
-.globl _gcm_gmult_neon
-.private_extern _gcm_gmult_neon
-#ifdef __thumb2__
-.thumb_func _gcm_gmult_neon
-#endif
-.align 4
-_gcm_gmult_neon:
- vld1.64 d7,[r0]! @ load Xi
- vld1.64 d6,[r0]!
- vmov.i64 d29,#0x0000ffffffffffff
- vldmia r1,{d26,d27} @ load twisted H
- vmov.i64 d30,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 q3,q3
-#endif
- vmov.i64 d31,#0x000000000000ffff
- veor d28,d26,d27 @ Karatsuba pre-processing
- mov r3,#16
- b Lgmult_neon
-
-
-.globl _gcm_ghash_neon
-.private_extern _gcm_ghash_neon
-#ifdef __thumb2__
-.thumb_func _gcm_ghash_neon
-#endif
-.align 4
-_gcm_ghash_neon:
- vld1.64 d1,[r0]! @ load Xi
- vld1.64 d0,[r0]!
- vmov.i64 d29,#0x0000ffffffffffff
- vldmia r1,{d26,d27} @ load twisted H
- vmov.i64 d30,#0x00000000ffffffff
-#ifdef __ARMEL__
- vrev64.8 q0,q0
-#endif
- vmov.i64 d31,#0x000000000000ffff
- veor d28,d26,d27 @ Karatsuba pre-processing
-
-Loop_neon:
- vld1.64 d7,[r2]! @ load inp
- vld1.64 d6,[r2]!
-#ifdef __ARMEL__
- vrev64.8 q3,q3
-#endif
- veor q3,q0 @ inp^=Xi
-Lgmult_neon:
- vext.8 d16, d26, d26, #1 @ A1
- vmull.p8 q8, d16, d6 @ F = A1*B
- vext.8 d0, d6, d6, #1 @ B1
- vmull.p8 q0, d26, d0 @ E = A*B1
- vext.8 d18, d26, d26, #2 @ A2
- vmull.p8 q9, d18, d6 @ H = A2*B
- vext.8 d22, d6, d6, #2 @ B2
- vmull.p8 q11, d26, d22 @ G = A*B2
- vext.8 d20, d26, d26, #3 @ A3
- veor q8, q8, q0 @ L = E + F
- vmull.p8 q10, d20, d6 @ J = A3*B
- vext.8 d0, d6, d6, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q0, d26, d0 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d6, d6, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d26, d22 @ K = A*B4
- veor q10, q10, q0 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q0, d26, d6 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q0, q0, q8
- veor q0, q0, q10
- veor d6,d6,d7 @ Karatsuba pre-processing
- vext.8 d16, d28, d28, #1 @ A1
- vmull.p8 q8, d16, d6 @ F = A1*B
- vext.8 d2, d6, d6, #1 @ B1
- vmull.p8 q1, d28, d2 @ E = A*B1
- vext.8 d18, d28, d28, #2 @ A2
- vmull.p8 q9, d18, d6 @ H = A2*B
- vext.8 d22, d6, d6, #2 @ B2
- vmull.p8 q11, d28, d22 @ G = A*B2
- vext.8 d20, d28, d28, #3 @ A3
- veor q8, q8, q1 @ L = E + F
- vmull.p8 q10, d20, d6 @ J = A3*B
- vext.8 d2, d6, d6, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q1, d28, d2 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d6, d6, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d28, d22 @ K = A*B4
- veor q10, q10, q1 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q1, d28, d6 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q1, q1, q8
- veor q1, q1, q10
- vext.8 d16, d27, d27, #1 @ A1
- vmull.p8 q8, d16, d7 @ F = A1*B
- vext.8 d4, d7, d7, #1 @ B1
- vmull.p8 q2, d27, d4 @ E = A*B1
- vext.8 d18, d27, d27, #2 @ A2
- vmull.p8 q9, d18, d7 @ H = A2*B
- vext.8 d22, d7, d7, #2 @ B2
- vmull.p8 q11, d27, d22 @ G = A*B2
- vext.8 d20, d27, d27, #3 @ A3
- veor q8, q8, q2 @ L = E + F
- vmull.p8 q10, d20, d7 @ J = A3*B
- vext.8 d4, d7, d7, #3 @ B3
- veor q9, q9, q11 @ M = G + H
- vmull.p8 q2, d27, d4 @ I = A*B3
- veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
- vand d17, d17, d29
- vext.8 d22, d7, d7, #4 @ B4
- veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
- vand d19, d19, d30
- vmull.p8 q11, d27, d22 @ K = A*B4
- veor q10, q10, q2 @ N = I + J
- veor d16, d16, d17
- veor d18, d18, d19
- veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
- vand d21, d21, d31
- vext.8 q8, q8, q8, #15
- veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d23, #0
- vext.8 q9, q9, q9, #14
- veor d20, d20, d21
- vmull.p8 q2, d27, d7 @ D = A*B
- vext.8 q11, q11, q11, #12
- vext.8 q10, q10, q10, #13
- veor q8, q8, q9
- veor q10, q10, q11
- veor q2, q2, q8
- veor q2, q2, q10
- veor q1,q1,q0 @ Karatsuba post-processing
- veor q1,q1,q2
- veor d1,d1,d2
- veor d4,d4,d3 @ Xh|Xl - 256-bit result
-
- @ equivalent of reduction_avx from ghash-x86_64.pl
- vshl.i64 q9,q0,#57 @ 1st phase
- vshl.i64 q10,q0,#62
- veor q10,q10,q9 @
- vshl.i64 q9,q0,#63
- veor q10, q10, q9 @
- veor d1,d1,d20 @
- veor d4,d4,d21
-
- vshr.u64 q10,q0,#1 @ 2nd phase
- veor q2,q2,q0
- veor q0,q0,q10 @
- vshr.u64 q10,q10,#6
- vshr.u64 q0,q0,#1 @
- veor q0,q0,q2 @
- veor q0,q0,q10 @
-
- subs r3,#16
- bne Loop_neon
-
-#ifdef __ARMEL__
- vrev64.8 q0,q0
-#endif
- sub r0,#16
- vst1.64 d1,[r0]! @ write out Xi
- vst1.64 d0,[r0]
-
- bx lr @ bx lr
-
-#endif
-.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S b/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S
deleted file mode 100644
index 37ee28e..0000000
--- a/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S
+++ /dev/null
@@ -1,252 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.code 32
-#undef __thumb2__
-.globl _gcm_init_v8
-.private_extern _gcm_init_v8
-#ifdef __thumb2__
-.thumb_func _gcm_init_v8
-#endif
-.align 4
-_gcm_init_v8:
- AARCH64_VALID_CALL_TARGET
- vld1.64 {q9},[r1] @ load input H
- vmov.i8 q11,#0xe1
- vshl.i64 q11,q11,#57 @ 0xc2.0
- vext.8 q3,q9,q9,#8
- vshr.u64 q10,q11,#63
- vdup.32 q9,d18[1]
- vext.8 q8,q10,q11,#8 @ t0=0xc2....01
- vshr.u64 q10,q3,#63
- vshr.s32 q9,q9,#31 @ broadcast carry bit
- vand q10,q10,q8
- vshl.i64 q3,q3,#1
- vext.8 q10,q10,q10,#8
- vand q8,q8,q9
- vorr q3,q3,q10 @ H<<<=1
- veor q12,q3,q8 @ twisted H
- vst1.64 {q12},[r0]! @ store Htable[0]
-
- @ calculate H^2
- vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
-.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
- veor q8,q8,q12
-.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
-.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
-
- vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
- veor q10,q0,q2
- veor q1,q1,q9
- veor q1,q1,q10
-.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
-
- vmov d4,d3 @ Xh|Xm - 256-bit result
- vmov d3,d0 @ Xm is rotated Xl
- veor q0,q1,q10
-
- vext.8 q10,q0,q0,#8 @ 2nd phase
-.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
- veor q10,q10,q2
- veor q14,q0,q10
-
- vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
- veor q9,q9,q14
- vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
- vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
- bx lr
-
-.globl _gcm_gmult_v8
-.private_extern _gcm_gmult_v8
-#ifdef __thumb2__
-.thumb_func _gcm_gmult_v8
-#endif
-.align 4
-_gcm_gmult_v8:
- AARCH64_VALID_CALL_TARGET
- vld1.64 {q9},[r0] @ load Xi
- vmov.i8 q11,#0xe1
- vld1.64 {q12,q13},[r1] @ load twisted H, ...
- vshl.u64 q11,q11,#57
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
- vext.8 q3,q9,q9,#8
-
-.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
- veor q9,q9,q3 @ Karatsuba pre-processing
-.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
-.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-
- vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
- veor q10,q0,q2
- veor q1,q1,q9
- veor q1,q1,q10
-.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
-
- vmov d4,d3 @ Xh|Xm - 256-bit result
- vmov d3,d0 @ Xm is rotated Xl
- veor q0,q1,q10
-
- vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
-.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
- veor q10,q10,q2
- veor q0,q0,q10
-
-#ifndef __ARMEB__
- vrev64.8 q0,q0
-#endif
- vext.8 q0,q0,q0,#8
- vst1.64 {q0},[r0] @ write out Xi
-
- bx lr
-
-.globl _gcm_ghash_v8
-.private_extern _gcm_ghash_v8
-#ifdef __thumb2__
-.thumb_func _gcm_ghash_v8
-#endif
-.align 4
-_gcm_ghash_v8:
- AARCH64_VALID_CALL_TARGET
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
- vld1.64 {q0},[r0] @ load [rotated] Xi
- @ "[rotated]" means that
- @ loaded value would have
- @ to be rotated in order to
- @ make it appear as in
- @ algorithm specification
- subs r3,r3,#32 @ see if r3 is 32 or larger
- mov r12,#16 @ r12 is used as post-
- @ increment for input pointer;
- @ as loop is modulo-scheduled
- @ r12 is zeroed just in time
- @ to preclude overstepping
- @ inp[len], which means that
- @ last block[s] are actually
- @ loaded twice, but last
- @ copy is not processed
- vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
- vmov.i8 q11,#0xe1
- vld1.64 {q14},[r1]
- moveq r12,#0 @ is it time to zero r12?
- vext.8 q0,q0,q0,#8 @ rotate Xi
- vld1.64 {q8},[r2]! @ load [rotated] I[0]
- vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
-#ifndef __ARMEB__
- vrev64.8 q8,q8
- vrev64.8 q0,q0
-#endif
- vext.8 q3,q8,q8,#8 @ rotate I[0]
- blo Lodd_tail_v8 @ r3 was less than 32
- vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
- vext.8 q7,q9,q9,#8
- veor q3,q3,q0 @ I[i]^=Xi
-.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
- veor q9,q9,q7 @ Karatsuba pre-processing
-.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
- b Loop_mod2x_v8
-
-.align 4
-Loop_mod2x_v8:
- vext.8 q10,q3,q3,#8
- subs r3,r3,#32 @ is there more data?
-.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
- movlo r12,#0 @ is it time to zero r12?
-
-.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
- veor q10,q10,q3 @ Karatsuba pre-processing
-.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
- veor q0,q0,q4 @ accumulate
-.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
- vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
-
- veor q2,q2,q6
- moveq r12,#0 @ is it time to zero r12?
- veor q1,q1,q5
-
- vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
- veor q10,q0,q2
- veor q1,q1,q9
- vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
-#ifndef __ARMEB__
- vrev64.8 q8,q8
-#endif
- veor q1,q1,q10
-.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
-
-#ifndef __ARMEB__
- vrev64.8 q9,q9
-#endif
- vmov d4,d3 @ Xh|Xm - 256-bit result
- vmov d3,d0 @ Xm is rotated Xl
- vext.8 q7,q9,q9,#8
- vext.8 q3,q8,q8,#8
- veor q0,q1,q10
-.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
- veor q3,q3,q2 @ accumulate q3 early
-
- vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
-.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
- veor q3,q3,q10
- veor q9,q9,q7 @ Karatsuba pre-processing
- veor q3,q3,q0
-.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
- bhs Loop_mod2x_v8 @ there was at least 32 more bytes
-
- veor q2,q2,q10
- vext.8 q3,q8,q8,#8 @ re-construct q3
- adds r3,r3,#32 @ re-construct r3
- veor q0,q0,q2 @ re-construct q0
- beq Ldone_v8 @ is r3 zero?
-Lodd_tail_v8:
- vext.8 q10,q0,q0,#8
- veor q3,q3,q0 @ inp^=Xi
- veor q9,q8,q10 @ q9 is rotated inp^Xi
-
-.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
- veor q9,q9,q3 @ Karatsuba pre-processing
-.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
-.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-
- vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
- veor q10,q0,q2
- veor q1,q1,q9
- veor q1,q1,q10
-.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
-
- vmov d4,d3 @ Xh|Xm - 256-bit result
- vmov d3,d0 @ Xm is rotated Xl
- veor q0,q1,q10
-
- vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
-.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
- veor q10,q10,q2
- veor q0,q0,q10
-
-Ldone_v8:
-#ifndef __ARMEB__
- vrev64.8 q0,q0
-#endif
- vext.8 q0,q0,q0,#8
- vst1.64 {q0},[r0] @ write out Xi
-
- vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
- bx lr
-
-.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S b/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S
deleted file mode 100644
index d653f2d..0000000
--- a/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S
+++ /dev/null
@@ -1,1510 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.globl _sha1_block_data_order
-.private_extern _sha1_block_data_order
-#ifdef __thumb2__
-.thumb_func _sha1_block_data_order
-#endif
-
-.align 5
-_sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-Lsha1_block:
- adr r3,Lsha1_block
- ldr r12,LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA1
- bne LARMv8
- tst r12,#ARMV7_NEON
- bne LNEON
-#endif
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- ldmia r0,{r3,r4,r5,r6,r7}
-Lloop:
- ldr r8,LK_00_19
- mov r14,sp
- sub sp,sp,#15*4
- mov r5,r5,ror#30
- mov r6,r6,ror#30
- mov r7,r7,ror#30 @ [6]
-L_00_15:
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r6,r8,r6,ror#2 @ E+=K_00_19
- eor r10,r4,r5 @ F_xx_xx
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r3,r10,ror#2
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r5,r8,r5,ror#2 @ E+=K_00_19
- eor r10,r3,r4 @ F_xx_xx
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r7,r10,ror#2
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r4,r8,r4,ror#2 @ E+=K_00_19
- eor r10,r7,r3 @ F_xx_xx
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r6,r10,ror#2
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r3,r8,r3,ror#2 @ E+=K_00_19
- eor r10,r6,r7 @ F_xx_xx
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r5,r10,ror#2
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-#if defined(__thumb2__)
- mov r12,sp
- teq r14,r12
-#else
- teq r14,sp
-#endif
- bne L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#25*4
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-
- ldr r8,LK_20_39 @ [+15+16*4]
- cmn sp,#0 @ [+3], clear carry to denote 20_39
-L_20_39_or_60_79:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r4,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_20_39(B,C,D)
-#if defined(__thumb2__)
- mov r12,sp
- teq r14,r12
-#else
- teq r14,sp @ preserve carry
-#endif
- bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
- bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes
-
- ldr r8,LK_40_59
- sub sp,sp,#20*4 @ [+2]
-L_40_59:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r4,r10,ror#2 @ F_xx_xx
- and r11,r5,r6 @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_40_59(B,C,D)
- add r7,r7,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- and r11,r4,r5 @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_40_59(B,C,D)
- add r6,r6,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- and r11,r3,r4 @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_40_59(B,C,D)
- add r5,r5,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- and r11,r7,r3 @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_40_59(B,C,D)
- add r4,r4,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- and r11,r6,r7 @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_40_59(B,C,D)
- add r3,r3,r11,ror#2
-#if defined(__thumb2__)
- mov r12,sp
- teq r14,r12
-#else
- teq r14,sp
-#endif
- bne L_40_59 @ [+((12+5)*5+2)*4]
-
- ldr r8,LK_60_79
- sub sp,sp,#20*4
- cmp sp,#0 @ set carry to denote 60_79
- b L_20_39_or_60_79 @ [+4], spare 300 bytes
-L_done:
- add sp,sp,#80*4 @ "deallocate" stack frame
- ldmia r0,{r8,r9,r10,r11,r12}
- add r3,r8,r3
- add r4,r9,r4
- add r5,r10,r5,ror#2
- add r6,r11,r6,ror#2
- add r7,r12,r7,ror#2
- stmia r0,{r3,r4,r5,r6,r7}
- teq r1,r2
- bne Lloop @ [+18], total 1307
-
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-#else
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
-.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-
-
-.align 5
-LK_00_19:.word 0x5a827999
-LK_20_39:.word 0x6ed9eba1
-LK_40_59:.word 0x8f1bbcdc
-LK_60_79:.word 0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-LOPENSSL_armcap:
-.word OPENSSL_armcap_P-Lsha1_block
-#endif
-.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 5
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func sha1_block_data_order_neon
-#endif
-.align 4
-sha1_block_data_order_neon:
-LNEON:
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- @ dmb @ errata #451034 on early Cortex A8
- @ vstmdb sp!,{d8-d15} @ ABI specification says so
- mov r14,sp
- sub r12,sp,#64
- adr r8,LK_00_19
- bic r12,r12,#15 @ align for 128-bit stores
-
- ldmia r0,{r3,r4,r5,r6,r7} @ load context
- mov sp,r12 @ alloca
-
- vld1.8 {q0,q1},[r1]! @ handles unaligned
- veor q15,q15,q15
- vld1.8 {q2,q3},[r1]!
- vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19
- vrev32.8 q0,q0 @ yes, even on
- vrev32.8 q1,q1 @ big-endian...
- vrev32.8 q2,q2
- vadd.i32 q8,q0,q14
- vrev32.8 q3,q3
- vadd.i32 q9,q1,q14
- vst1.32 {q8},[r12,:128]!
- vadd.i32 q10,q2,q14
- vst1.32 {q9},[r12,:128]!
- vst1.32 {q10},[r12,:128]!
- ldr r9,[sp] @ big RAW stall
-
-Loop_neon:
- vext.8 q8,q0,q1,#8
- bic r10,r6,r4
- add r7,r7,r9
- and r11,r5,r4
- vadd.i32 q13,q3,q14
- ldr r9,[sp,#4]
- add r7,r7,r3,ror#27
- vext.8 q12,q3,q15,#4
- eor r11,r11,r10
- mov r4,r4,ror#2
- add r7,r7,r11
- veor q8,q8,q0
- bic r10,r5,r3
- add r6,r6,r9
- veor q12,q12,q2
- and r11,r4,r3
- ldr r9,[sp,#8]
- veor q12,q12,q8
- add r6,r6,r7,ror#27
- eor r11,r11,r10
- vst1.32 {q13},[r12,:128]!
- sub r12,r12,#64
- mov r3,r3,ror#2
- add r6,r6,r11
- vext.8 q13,q15,q12,#4
- bic r10,r4,r7
- add r5,r5,r9
- vadd.i32 q8,q12,q12
- and r11,r3,r7
- ldr r9,[sp,#12]
- vsri.32 q8,q12,#31
- add r5,r5,r6,ror#27
- eor r11,r11,r10
- mov r7,r7,ror#2
- vshr.u32 q12,q13,#30
- add r5,r5,r11
- bic r10,r3,r6
- vshl.u32 q13,q13,#2
- add r4,r4,r9
- and r11,r7,r6
- veor q8,q8,q12
- ldr r9,[sp,#16]
- add r4,r4,r5,ror#27
- veor q8,q8,q13
- eor r11,r11,r10
- mov r6,r6,ror#2
- add r4,r4,r11
- vext.8 q9,q1,q2,#8
- bic r10,r7,r5
- add r3,r3,r9
- and r11,r6,r5
- vadd.i32 q13,q8,q14
- ldr r9,[sp,#20]
- vld1.32 {d28[],d29[]},[r8,:32]!
- add r3,r3,r4,ror#27
- vext.8 q12,q8,q15,#4
- eor r11,r11,r10
- mov r5,r5,ror#2
- add r3,r3,r11
- veor q9,q9,q1
- bic r10,r6,r4
- add r7,r7,r9
- veor q12,q12,q3
- and r11,r5,r4
- ldr r9,[sp,#24]
- veor q12,q12,q9
- add r7,r7,r3,ror#27
- eor r11,r11,r10
- vst1.32 {q13},[r12,:128]!
- mov r4,r4,ror#2
- add r7,r7,r11
- vext.8 q13,q15,q12,#4
- bic r10,r5,r3
- add r6,r6,r9
- vadd.i32 q9,q12,q12
- and r11,r4,r3
- ldr r9,[sp,#28]
- vsri.32 q9,q12,#31
- add r6,r6,r7,ror#27
- eor r11,r11,r10
- mov r3,r3,ror#2
- vshr.u32 q12,q13,#30
- add r6,r6,r11
- bic r10,r4,r7
- vshl.u32 q13,q13,#2
- add r5,r5,r9
- and r11,r3,r7
- veor q9,q9,q12
- ldr r9,[sp,#32]
- add r5,r5,r6,ror#27
- veor q9,q9,q13
- eor r11,r11,r10
- mov r7,r7,ror#2
- add r5,r5,r11
- vext.8 q10,q2,q3,#8
- bic r10,r3,r6
- add r4,r4,r9
- and r11,r7,r6
- vadd.i32 q13,q9,q14
- ldr r9,[sp,#36]
- add r4,r4,r5,ror#27
- vext.8 q12,q9,q15,#4
- eor r11,r11,r10
- mov r6,r6,ror#2
- add r4,r4,r11
- veor q10,q10,q2
- bic r10,r7,r5
- add r3,r3,r9
- veor q12,q12,q8
- and r11,r6,r5
- ldr r9,[sp,#40]
- veor q12,q12,q10
- add r3,r3,r4,ror#27
- eor r11,r11,r10
- vst1.32 {q13},[r12,:128]!
- mov r5,r5,ror#2
- add r3,r3,r11
- vext.8 q13,q15,q12,#4
- bic r10,r6,r4
- add r7,r7,r9
- vadd.i32 q10,q12,q12
- and r11,r5,r4
- ldr r9,[sp,#44]
- vsri.32 q10,q12,#31
- add r7,r7,r3,ror#27
- eor r11,r11,r10
- mov r4,r4,ror#2
- vshr.u32 q12,q13,#30
- add r7,r7,r11
- bic r10,r5,r3
- vshl.u32 q13,q13,#2
- add r6,r6,r9
- and r11,r4,r3
- veor q10,q10,q12
- ldr r9,[sp,#48]
- add r6,r6,r7,ror#27
- veor q10,q10,q13
- eor r11,r11,r10
- mov r3,r3,ror#2
- add r6,r6,r11
- vext.8 q11,q3,q8,#8
- bic r10,r4,r7
- add r5,r5,r9
- and r11,r3,r7
- vadd.i32 q13,q10,q14
- ldr r9,[sp,#52]
- add r5,r5,r6,ror#27
- vext.8 q12,q10,q15,#4
- eor r11,r11,r10
- mov r7,r7,ror#2
- add r5,r5,r11
- veor q11,q11,q3
- bic r10,r3,r6
- add r4,r4,r9
- veor q12,q12,q9
- and r11,r7,r6
- ldr r9,[sp,#56]
- veor q12,q12,q11
- add r4,r4,r5,ror#27
- eor r11,r11,r10
- vst1.32 {q13},[r12,:128]!
- mov r6,r6,ror#2
- add r4,r4,r11
- vext.8 q13,q15,q12,#4
- bic r10,r7,r5
- add r3,r3,r9
- vadd.i32 q11,q12,q12
- and r11,r6,r5
- ldr r9,[sp,#60]
- vsri.32 q11,q12,#31
- add r3,r3,r4,ror#27
- eor r11,r11,r10
- mov r5,r5,ror#2
- vshr.u32 q12,q13,#30
- add r3,r3,r11
- bic r10,r6,r4
- vshl.u32 q13,q13,#2
- add r7,r7,r9
- and r11,r5,r4
- veor q11,q11,q12
- ldr r9,[sp,#0]
- add r7,r7,r3,ror#27
- veor q11,q11,q13
- eor r11,r11,r10
- mov r4,r4,ror#2
- add r7,r7,r11
- vext.8 q12,q10,q11,#8
- bic r10,r5,r3
- add r6,r6,r9
- and r11,r4,r3
- veor q0,q0,q8
- ldr r9,[sp,#4]
- add r6,r6,r7,ror#27
- veor q0,q0,q1
- eor r11,r11,r10
- mov r3,r3,ror#2
- vadd.i32 q13,q11,q14
- add r6,r6,r11
- bic r10,r4,r7
- veor q12,q12,q0
- add r5,r5,r9
- and r11,r3,r7
- vshr.u32 q0,q12,#30
- ldr r9,[sp,#8]
- add r5,r5,r6,ror#27
- vst1.32 {q13},[r12,:128]!
- sub r12,r12,#64
- eor r11,r11,r10
- mov r7,r7,ror#2
- vsli.32 q0,q12,#2
- add r5,r5,r11
- bic r10,r3,r6
- add r4,r4,r9
- and r11,r7,r6
- ldr r9,[sp,#12]
- add r4,r4,r5,ror#27
- eor r11,r11,r10
- mov r6,r6,ror#2
- add r4,r4,r11
- bic r10,r7,r5
- add r3,r3,r9
- and r11,r6,r5
- ldr r9,[sp,#16]
- add r3,r3,r4,ror#27
- eor r11,r11,r10
- mov r5,r5,ror#2
- add r3,r3,r11
- vext.8 q12,q11,q0,#8
- eor r10,r4,r6
- add r7,r7,r9
- ldr r9,[sp,#20]
- veor q1,q1,q9
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- veor q1,q1,q2
- mov r4,r4,ror#2
- add r7,r7,r11
- vadd.i32 q13,q0,q14
- eor r10,r3,r5
- add r6,r6,r9
- veor q12,q12,q1
- ldr r9,[sp,#24]
- eor r11,r10,r4
- vshr.u32 q1,q12,#30
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- vst1.32 {q13},[r12,:128]!
- add r6,r6,r11
- eor r10,r7,r4
- vsli.32 q1,q12,#2
- add r5,r5,r9
- ldr r9,[sp,#28]
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- eor r10,r6,r3
- add r4,r4,r9
- ldr r9,[sp,#32]
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- vext.8 q12,q0,q1,#8
- eor r10,r5,r7
- add r3,r3,r9
- ldr r9,[sp,#36]
- veor q2,q2,q10
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- veor q2,q2,q3
- mov r5,r5,ror#2
- add r3,r3,r11
- vadd.i32 q13,q1,q14
- eor r10,r4,r6
- vld1.32 {d28[],d29[]},[r8,:32]!
- add r7,r7,r9
- veor q12,q12,q2
- ldr r9,[sp,#40]
- eor r11,r10,r5
- vshr.u32 q2,q12,#30
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- vst1.32 {q13},[r12,:128]!
- add r7,r7,r11
- eor r10,r3,r5
- vsli.32 q2,q12,#2
- add r6,r6,r9
- ldr r9,[sp,#44]
- eor r11,r10,r4
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- add r6,r6,r11
- eor r10,r7,r4
- add r5,r5,r9
- ldr r9,[sp,#48]
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- vext.8 q12,q1,q2,#8
- eor r10,r6,r3
- add r4,r4,r9
- ldr r9,[sp,#52]
- veor q3,q3,q11
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- veor q3,q3,q8
- mov r6,r6,ror#2
- add r4,r4,r11
- vadd.i32 q13,q2,q14
- eor r10,r5,r7
- add r3,r3,r9
- veor q12,q12,q3
- ldr r9,[sp,#56]
- eor r11,r10,r6
- vshr.u32 q3,q12,#30
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- vst1.32 {q13},[r12,:128]!
- add r3,r3,r11
- eor r10,r4,r6
- vsli.32 q3,q12,#2
- add r7,r7,r9
- ldr r9,[sp,#60]
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- add r7,r7,r11
- eor r10,r3,r5
- add r6,r6,r9
- ldr r9,[sp,#0]
- eor r11,r10,r4
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- add r6,r6,r11
- vext.8 q12,q2,q3,#8
- eor r10,r7,r4
- add r5,r5,r9
- ldr r9,[sp,#4]
- veor q8,q8,q0
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- veor q8,q8,q9
- mov r7,r7,ror#2
- add r5,r5,r11
- vadd.i32 q13,q3,q14
- eor r10,r6,r3
- add r4,r4,r9
- veor q12,q12,q8
- ldr r9,[sp,#8]
- eor r11,r10,r7
- vshr.u32 q8,q12,#30
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- vst1.32 {q13},[r12,:128]!
- sub r12,r12,#64
- add r4,r4,r11
- eor r10,r5,r7
- vsli.32 q8,q12,#2
- add r3,r3,r9
- ldr r9,[sp,#12]
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- add r3,r3,r11
- eor r10,r4,r6
- add r7,r7,r9
- ldr r9,[sp,#16]
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- add r7,r7,r11
- vext.8 q12,q3,q8,#8
- eor r10,r3,r5
- add r6,r6,r9
- ldr r9,[sp,#20]
- veor q9,q9,q1
- eor r11,r10,r4
- add r6,r6,r7,ror#27
- veor q9,q9,q10
- mov r3,r3,ror#2
- add r6,r6,r11
- vadd.i32 q13,q8,q14
- eor r10,r7,r4
- add r5,r5,r9
- veor q12,q12,q9
- ldr r9,[sp,#24]
- eor r11,r10,r3
- vshr.u32 q9,q12,#30
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- vst1.32 {q13},[r12,:128]!
- add r5,r5,r11
- eor r10,r6,r3
- vsli.32 q9,q12,#2
- add r4,r4,r9
- ldr r9,[sp,#28]
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- eor r10,r5,r7
- add r3,r3,r9
- ldr r9,[sp,#32]
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- add r3,r3,r11
- vext.8 q12,q8,q9,#8
- add r7,r7,r9
- and r10,r5,r6
- ldr r9,[sp,#36]
- veor q10,q10,q2
- add r7,r7,r3,ror#27
- eor r11,r5,r6
- veor q10,q10,q11
- add r7,r7,r10
- and r11,r11,r4
- vadd.i32 q13,q9,q14
- mov r4,r4,ror#2
- add r7,r7,r11
- veor q12,q12,q10
- add r6,r6,r9
- and r10,r4,r5
- vshr.u32 q10,q12,#30
- ldr r9,[sp,#40]
- add r6,r6,r7,ror#27
- vst1.32 {q13},[r12,:128]!
- eor r11,r4,r5
- add r6,r6,r10
- vsli.32 q10,q12,#2
- and r11,r11,r3
- mov r3,r3,ror#2
- add r6,r6,r11
- add r5,r5,r9
- and r10,r3,r4
- ldr r9,[sp,#44]
- add r5,r5,r6,ror#27
- eor r11,r3,r4
- add r5,r5,r10
- and r11,r11,r7
- mov r7,r7,ror#2
- add r5,r5,r11
- add r4,r4,r9
- and r10,r7,r3
- ldr r9,[sp,#48]
- add r4,r4,r5,ror#27
- eor r11,r7,r3
- add r4,r4,r10
- and r11,r11,r6
- mov r6,r6,ror#2
- add r4,r4,r11
- vext.8 q12,q9,q10,#8
- add r3,r3,r9
- and r10,r6,r7
- ldr r9,[sp,#52]
- veor q11,q11,q3
- add r3,r3,r4,ror#27
- eor r11,r6,r7
- veor q11,q11,q0
- add r3,r3,r10
- and r11,r11,r5
- vadd.i32 q13,q10,q14
- mov r5,r5,ror#2
- vld1.32 {d28[],d29[]},[r8,:32]!
- add r3,r3,r11
- veor q12,q12,q11
- add r7,r7,r9
- and r10,r5,r6
- vshr.u32 q11,q12,#30
- ldr r9,[sp,#56]
- add r7,r7,r3,ror#27
- vst1.32 {q13},[r12,:128]!
- eor r11,r5,r6
- add r7,r7,r10
- vsli.32 q11,q12,#2
- and r11,r11,r4
- mov r4,r4,ror#2
- add r7,r7,r11
- add r6,r6,r9
- and r10,r4,r5
- ldr r9,[sp,#60]
- add r6,r6,r7,ror#27
- eor r11,r4,r5
- add r6,r6,r10
- and r11,r11,r3
- mov r3,r3,ror#2
- add r6,r6,r11
- add r5,r5,r9
- and r10,r3,r4
- ldr r9,[sp,#0]
- add r5,r5,r6,ror#27
- eor r11,r3,r4
- add r5,r5,r10
- and r11,r11,r7
- mov r7,r7,ror#2
- add r5,r5,r11
- vext.8 q12,q10,q11,#8
- add r4,r4,r9
- and r10,r7,r3
- ldr r9,[sp,#4]
- veor q0,q0,q8
- add r4,r4,r5,ror#27
- eor r11,r7,r3
- veor q0,q0,q1
- add r4,r4,r10
- and r11,r11,r6
- vadd.i32 q13,q11,q14
- mov r6,r6,ror#2
- add r4,r4,r11
- veor q12,q12,q0
- add r3,r3,r9
- and r10,r6,r7
- vshr.u32 q0,q12,#30
- ldr r9,[sp,#8]
- add r3,r3,r4,ror#27
- vst1.32 {q13},[r12,:128]!
- sub r12,r12,#64
- eor r11,r6,r7
- add r3,r3,r10
- vsli.32 q0,q12,#2
- and r11,r11,r5
- mov r5,r5,ror#2
- add r3,r3,r11
- add r7,r7,r9
- and r10,r5,r6
- ldr r9,[sp,#12]
- add r7,r7,r3,ror#27
- eor r11,r5,r6
- add r7,r7,r10
- and r11,r11,r4
- mov r4,r4,ror#2
- add r7,r7,r11
- add r6,r6,r9
- and r10,r4,r5
- ldr r9,[sp,#16]
- add r6,r6,r7,ror#27
- eor r11,r4,r5
- add r6,r6,r10
- and r11,r11,r3
- mov r3,r3,ror#2
- add r6,r6,r11
- vext.8 q12,q11,q0,#8
- add r5,r5,r9
- and r10,r3,r4
- ldr r9,[sp,#20]
- veor q1,q1,q9
- add r5,r5,r6,ror#27
- eor r11,r3,r4
- veor q1,q1,q2
- add r5,r5,r10
- and r11,r11,r7
- vadd.i32 q13,q0,q14
- mov r7,r7,ror#2
- add r5,r5,r11
- veor q12,q12,q1
- add r4,r4,r9
- and r10,r7,r3
- vshr.u32 q1,q12,#30
- ldr r9,[sp,#24]
- add r4,r4,r5,ror#27
- vst1.32 {q13},[r12,:128]!
- eor r11,r7,r3
- add r4,r4,r10
- vsli.32 q1,q12,#2
- and r11,r11,r6
- mov r6,r6,ror#2
- add r4,r4,r11
- add r3,r3,r9
- and r10,r6,r7
- ldr r9,[sp,#28]
- add r3,r3,r4,ror#27
- eor r11,r6,r7
- add r3,r3,r10
- and r11,r11,r5
- mov r5,r5,ror#2
- add r3,r3,r11
- add r7,r7,r9
- and r10,r5,r6
- ldr r9,[sp,#32]
- add r7,r7,r3,ror#27
- eor r11,r5,r6
- add r7,r7,r10
- and r11,r11,r4
- mov r4,r4,ror#2
- add r7,r7,r11
- vext.8 q12,q0,q1,#8
- add r6,r6,r9
- and r10,r4,r5
- ldr r9,[sp,#36]
- veor q2,q2,q10
- add r6,r6,r7,ror#27
- eor r11,r4,r5
- veor q2,q2,q3
- add r6,r6,r10
- and r11,r11,r3
- vadd.i32 q13,q1,q14
- mov r3,r3,ror#2
- add r6,r6,r11
- veor q12,q12,q2
- add r5,r5,r9
- and r10,r3,r4
- vshr.u32 q2,q12,#30
- ldr r9,[sp,#40]
- add r5,r5,r6,ror#27
- vst1.32 {q13},[r12,:128]!
- eor r11,r3,r4
- add r5,r5,r10
- vsli.32 q2,q12,#2
- and r11,r11,r7
- mov r7,r7,ror#2
- add r5,r5,r11
- add r4,r4,r9
- and r10,r7,r3
- ldr r9,[sp,#44]
- add r4,r4,r5,ror#27
- eor r11,r7,r3
- add r4,r4,r10
- and r11,r11,r6
- mov r6,r6,ror#2
- add r4,r4,r11
- add r3,r3,r9
- and r10,r6,r7
- ldr r9,[sp,#48]
- add r3,r3,r4,ror#27
- eor r11,r6,r7
- add r3,r3,r10
- and r11,r11,r5
- mov r5,r5,ror#2
- add r3,r3,r11
- vext.8 q12,q1,q2,#8
- eor r10,r4,r6
- add r7,r7,r9
- ldr r9,[sp,#52]
- veor q3,q3,q11
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- veor q3,q3,q8
- mov r4,r4,ror#2
- add r7,r7,r11
- vadd.i32 q13,q2,q14
- eor r10,r3,r5
- add r6,r6,r9
- veor q12,q12,q3
- ldr r9,[sp,#56]
- eor r11,r10,r4
- vshr.u32 q3,q12,#30
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- vst1.32 {q13},[r12,:128]!
- add r6,r6,r11
- eor r10,r7,r4
- vsli.32 q3,q12,#2
- add r5,r5,r9
- ldr r9,[sp,#60]
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- eor r10,r6,r3
- add r4,r4,r9
- ldr r9,[sp,#0]
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- vadd.i32 q13,q3,q14
- eor r10,r5,r7
- add r3,r3,r9
- vst1.32 {q13},[r12,:128]!
- sub r12,r12,#64
- teq r1,r2
- sub r8,r8,#16
- it eq
- subeq r1,r1,#64
- vld1.8 {q0,q1},[r1]!
- ldr r9,[sp,#4]
- eor r11,r10,r6
- vld1.8 {q2,q3},[r1]!
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- vld1.32 {d28[],d29[]},[r8,:32]!
- add r3,r3,r11
- eor r10,r4,r6
- vrev32.8 q0,q0
- add r7,r7,r9
- ldr r9,[sp,#8]
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- add r7,r7,r11
- eor r10,r3,r5
- add r6,r6,r9
- ldr r9,[sp,#12]
- eor r11,r10,r4
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- add r6,r6,r11
- eor r10,r7,r4
- add r5,r5,r9
- ldr r9,[sp,#16]
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- vrev32.8 q1,q1
- eor r10,r6,r3
- add r4,r4,r9
- vadd.i32 q8,q0,q14
- ldr r9,[sp,#20]
- eor r11,r10,r7
- vst1.32 {q8},[r12,:128]!
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- eor r10,r5,r7
- add r3,r3,r9
- ldr r9,[sp,#24]
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- add r3,r3,r11
- eor r10,r4,r6
- add r7,r7,r9
- ldr r9,[sp,#28]
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- add r7,r7,r11
- eor r10,r3,r5
- add r6,r6,r9
- ldr r9,[sp,#32]
- eor r11,r10,r4
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- add r6,r6,r11
- vrev32.8 q2,q2
- eor r10,r7,r4
- add r5,r5,r9
- vadd.i32 q9,q1,q14
- ldr r9,[sp,#36]
- eor r11,r10,r3
- vst1.32 {q9},[r12,:128]!
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- eor r10,r6,r3
- add r4,r4,r9
- ldr r9,[sp,#40]
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- eor r10,r5,r7
- add r3,r3,r9
- ldr r9,[sp,#44]
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- add r3,r3,r11
- eor r10,r4,r6
- add r7,r7,r9
- ldr r9,[sp,#48]
- eor r11,r10,r5
- add r7,r7,r3,ror#27
- mov r4,r4,ror#2
- add r7,r7,r11
- vrev32.8 q3,q3
- eor r10,r3,r5
- add r6,r6,r9
- vadd.i32 q10,q2,q14
- ldr r9,[sp,#52]
- eor r11,r10,r4
- vst1.32 {q10},[r12,:128]!
- add r6,r6,r7,ror#27
- mov r3,r3,ror#2
- add r6,r6,r11
- eor r10,r7,r4
- add r5,r5,r9
- ldr r9,[sp,#56]
- eor r11,r10,r3
- add r5,r5,r6,ror#27
- mov r7,r7,ror#2
- add r5,r5,r11
- eor r10,r6,r3
- add r4,r4,r9
- ldr r9,[sp,#60]
- eor r11,r10,r7
- add r4,r4,r5,ror#27
- mov r6,r6,ror#2
- add r4,r4,r11
- eor r10,r5,r7
- add r3,r3,r9
- eor r11,r10,r6
- add r3,r3,r4,ror#27
- mov r5,r5,ror#2
- add r3,r3,r11
- ldmia r0,{r9,r10,r11,r12} @ accumulate context
- add r3,r3,r9
- ldr r9,[r0,#16]
- add r4,r4,r10
- add r5,r5,r11
- add r6,r6,r12
- it eq
- moveq sp,r14
- add r7,r7,r9
- it ne
- ldrne r9,[sp]
- stmia r0,{r3,r4,r5,r6,r7}
- itt ne
- addne r12,sp,#3*16
- bne Loop_neon
-
- @ vldmia sp!,{d8-d15}
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-
-#endif
-#if __ARM_MAX_ARCH__>=7
-
-# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xf,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d|0x10
-# endif
-
-#ifdef __thumb2__
-.thumb_func sha1_block_data_order_armv8
-#endif
-.align 5
-sha1_block_data_order_armv8:
-LARMv8:
- vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
-
- veor q1,q1,q1
- adr r3,LK_00_19
- vld1.32 {q0},[r0]!
- vld1.32 {d2[0]},[r0]
- sub r0,r0,#16
- vld1.32 {d16[],d17[]},[r3,:32]!
- vld1.32 {d18[],d19[]},[r3,:32]!
- vld1.32 {d20[],d21[]},[r3,:32]!
- vld1.32 {d22[],d23[]},[r3,:32]
-
-Loop_v8:
- vld1.8 {q4,q5},[r1]!
- vld1.8 {q6,q7},[r1]!
- vrev32.8 q4,q4
- vrev32.8 q5,q5
-
- vadd.i32 q12,q8,q4
- vrev32.8 q6,q6
- vmov q14,q0 @ offload
- subs r2,r2,#1
-
- vadd.i32 q13,q8,q5
- vrev32.8 q7,q7
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0
- INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12
- vadd.i32 q12,q8,q6
- INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1
- INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
- vadd.i32 q13,q8,q7
- INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
- INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2
- INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
- vadd.i32 q12,q8,q4
- INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
- INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3
- INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
- vadd.i32 q13,q9,q5
- INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
- INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4
- INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
- vadd.i32 q12,q9,q6
- INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
- INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
- vadd.i32 q13,q9,q7
- INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
- INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6
- INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
- vadd.i32 q12,q9,q4
- INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
- INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
- vadd.i32 q13,q9,q5
- INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
- INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8
- INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
- vadd.i32 q12,q10,q6
- INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
- INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
- vadd.i32 q13,q10,q7
- INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
- INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10
- INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
- vadd.i32 q12,q10,q4
- INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
- INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11
- INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
- vadd.i32 q13,q10,q5
- INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
- INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12
- INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
- vadd.i32 q12,q10,q6
- INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
- INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13
- INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
- vadd.i32 q13,q11,q7
- INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
- INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14
- INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
- vadd.i32 q12,q11,q4
- INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
- INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
- vadd.i32 q13,q11,q5
- INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
- INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16
- INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
- vadd.i32 q12,q11,q6
- INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
- vadd.i32 q13,q11,q7
-
- INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18
- INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
-
- INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19
- INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
-
- vadd.i32 q1,q1,q2
- vadd.i32 q0,q0,q14
- bne Loop_v8
-
- vst1.32 {q0},[r0]!
- vst1.32 {d2[0]},[r0]
-
- vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
- bx lr @ bx lr
-
-#endif
-#if __ARM_MAX_ARCH__>=7
-.comm _OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol _OPENSSL_armcap_P
-.long 0
-.private_extern _OPENSSL_armcap_P
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S b/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S
deleted file mode 100644
index 8379765..0000000
--- a/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S
+++ /dev/null
@@ -1,2838 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License"). You may not use
-@ this file except in compliance with the License. You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA256 block procedure for ARMv4. May 2007.
-
-@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
-@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-@ byte [on single-issue Xscale PXA250 core].
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
-@ Cortex A8 core and ~20 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-@ September 2013.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process one
-@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-@ code (meaning that latter performs sub-optimally, nothing was done
-@ about it).
-
-@ May 2014.
-@
-@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
-@ instructions are manually-encoded. (See unsha256.)
-
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-LOPENSSL_armcap:
-.word OPENSSL_armcap_P-Lsha256_block_data_order
-#endif
-.align 5
-
-.globl _sha256_block_data_order
-.private_extern _sha256_block_data_order
-#ifdef __thumb2__
-.thumb_func _sha256_block_data_order
-#endif
-_sha256_block_data_order:
-Lsha256_block_data_order:
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r3,pc,#8 @ _sha256_block_data_order
-#else
- adr r3,Lsha256_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA256
- bne LARMv8
- tst r12,#ARMV7_NEON
- bne LNEON
-#endif
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
- stmdb sp!,{r0,r1,r2,r4-r11,lr}
- ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
- sub r14,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-Loop:
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ magic
- eor r12,r12,r12
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 0
-# if 0==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 0
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 0==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#0*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 0==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 0<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#2*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#15*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 1
-# if 1==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 1
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 1==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#1*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 1==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 1<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#3*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#0*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 2
-# if 2==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 2
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 2==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#2*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 2==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 2<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#4*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#1*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 3
-# if 3==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 3
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 3==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#3*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 3==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 3<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#5*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#2*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 4
-# if 4==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 4
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 4==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#4*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 4==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 4<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#6*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#3*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 5
-# if 5==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 5==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#5*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 5==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 5<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#7*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#4*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 6
-# if 6==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 6
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 6==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#6*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 6==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 6<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#8*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#5*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 7
-# if 7==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 7==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#7*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 7==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 7<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#9*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#6*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 8
-# if 8==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 8
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 8==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r8,r8,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r8,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#8*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 8==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 8<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#10*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#7*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 9
-# if 9==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 9
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 9==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r7,r7,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r7,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#9*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 9==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 9<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#11*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#8*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 10
-# if 10==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 10
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 10==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r6,r6,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r6,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#10*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 10==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 10<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#12*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#9*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 11
-# if 11==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 11
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 11==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r5,r5,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r5,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#11*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 11==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 11<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#13*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#10*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 12
-# if 12==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 12
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 12==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r4,r4,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r4,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#12*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 12==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 12<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#14*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#11*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 13
-# if 13==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 13
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 13==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r11,r11,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r11,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#13*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 13==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 13<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#15*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#12*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 14
-# if 14==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 14
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- ldrb r12,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r12,lsl#8
- ldrb r12,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 14==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r10,r10,ror#5
- orr r2,r2,r12,lsl#24
- eor r0,r0,r10,ror#19 @ Sigma1(e)
-#endif
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#14*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 14==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 14<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#0*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#13*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- @ ldr r2,[r1],#4 @ 15
-# if 15==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-# ifndef __ARMEB__
- rev r2,r2
-# endif
-#else
- @ ldrb r2,[r1,#3] @ 15
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- ldrb r3,[r1,#2]
- ldrb r0,[r1,#1]
- orr r2,r2,r3,lsl#8
- ldrb r3,[r1],#4
- orr r2,r2,r0,lsl#16
-# if 15==15
- str r1,[sp,#17*4] @ make room for r1
-# endif
- eor r0,r9,r9,ror#5
- orr r2,r2,r3,lsl#24
- eor r0,r0,r9,ror#19 @ Sigma1(e)
-#endif
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#15*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 15==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 15<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#1*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#14*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-Lrounds_16_xx:
- @ ldr r2,[sp,#1*4] @ 16
- @ ldr r1,[sp,#14*4]
- mov r0,r2,ror#7
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#0*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#9*4]
-
- add r12,r12,r0
- eor r0,r8,r8,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r8,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#0*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 16==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 16<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#2*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#15*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#2*4] @ 17
- @ ldr r1,[sp,#15*4]
- mov r0,r2,ror#7
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#1*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#10*4]
-
- add r3,r3,r0
- eor r0,r7,r7,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r7,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#1*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 17==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 17<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#3*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#0*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#3*4] @ 18
- @ ldr r1,[sp,#0*4]
- mov r0,r2,ror#7
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#2*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#11*4]
-
- add r12,r12,r0
- eor r0,r6,r6,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r6,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#2*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 18==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 18<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#4*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#1*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#4*4] @ 19
- @ ldr r1,[sp,#1*4]
- mov r0,r2,ror#7
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#3*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#12*4]
-
- add r3,r3,r0
- eor r0,r5,r5,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r5,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#3*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 19==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 19<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#5*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#2*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#5*4] @ 20
- @ ldr r1,[sp,#2*4]
- mov r0,r2,ror#7
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#4*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#13*4]
-
- add r12,r12,r0
- eor r0,r4,r4,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r4,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#4*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 20==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 20<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#6*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#3*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#6*4] @ 21
- @ ldr r1,[sp,#3*4]
- mov r0,r2,ror#7
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#5*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#14*4]
-
- add r3,r3,r0
- eor r0,r11,r11,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r11,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#5*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 21==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 21<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#7*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#4*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#7*4] @ 22
- @ ldr r1,[sp,#4*4]
- mov r0,r2,ror#7
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#6*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#15*4]
-
- add r12,r12,r0
- eor r0,r10,r10,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r10,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#6*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 22==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 22<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#8*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#5*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#8*4] @ 23
- @ ldr r1,[sp,#5*4]
- mov r0,r2,ror#7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#7*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#0*4]
-
- add r3,r3,r0
- eor r0,r9,r9,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r9,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#7*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 23==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 23<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#9*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#6*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#9*4] @ 24
- @ ldr r1,[sp,#6*4]
- mov r0,r2,ror#7
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#8*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#1*4]
-
- add r12,r12,r0
- eor r0,r8,r8,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r8,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r11,r11,r2 @ h+=X[i]
- str r2,[sp,#8*4]
- eor r2,r9,r10
- add r11,r11,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r8
- add r11,r11,r12 @ h+=K256[i]
- eor r2,r2,r10 @ Ch(e,f,g)
- eor r0,r4,r4,ror#11
- add r11,r11,r2 @ h+=Ch(e,f,g)
-#if 24==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 24<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r4,r5 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#10*4] @ from future BODY_16_xx
- eor r12,r4,r5 @ a^b, b^c in next round
- ldr r1,[sp,#7*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r4,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r7,r7,r11 @ d+=h
- eor r3,r3,r5 @ Maj(a,b,c)
- add r11,r11,r0,ror#2 @ h+=Sigma0(a)
- @ add r11,r11,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#10*4] @ 25
- @ ldr r1,[sp,#7*4]
- mov r0,r2,ror#7
- add r11,r11,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#9*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#2*4]
-
- add r3,r3,r0
- eor r0,r7,r7,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r7,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r10,r10,r2 @ h+=X[i]
- str r2,[sp,#9*4]
- eor r2,r8,r9
- add r10,r10,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r7
- add r10,r10,r3 @ h+=K256[i]
- eor r2,r2,r9 @ Ch(e,f,g)
- eor r0,r11,r11,ror#11
- add r10,r10,r2 @ h+=Ch(e,f,g)
-#if 25==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 25<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r11,r4 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#11*4] @ from future BODY_16_xx
- eor r3,r11,r4 @ a^b, b^c in next round
- ldr r1,[sp,#8*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r11,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r6,r6,r10 @ d+=h
- eor r12,r12,r4 @ Maj(a,b,c)
- add r10,r10,r0,ror#2 @ h+=Sigma0(a)
- @ add r10,r10,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#11*4] @ 26
- @ ldr r1,[sp,#8*4]
- mov r0,r2,ror#7
- add r10,r10,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#10*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#3*4]
-
- add r12,r12,r0
- eor r0,r6,r6,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r6,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r9,r9,r2 @ h+=X[i]
- str r2,[sp,#10*4]
- eor r2,r7,r8
- add r9,r9,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r6
- add r9,r9,r12 @ h+=K256[i]
- eor r2,r2,r8 @ Ch(e,f,g)
- eor r0,r10,r10,ror#11
- add r9,r9,r2 @ h+=Ch(e,f,g)
-#if 26==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 26<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r10,r11 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#12*4] @ from future BODY_16_xx
- eor r12,r10,r11 @ a^b, b^c in next round
- ldr r1,[sp,#9*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r10,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r5,r5,r9 @ d+=h
- eor r3,r3,r11 @ Maj(a,b,c)
- add r9,r9,r0,ror#2 @ h+=Sigma0(a)
- @ add r9,r9,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#12*4] @ 27
- @ ldr r1,[sp,#9*4]
- mov r0,r2,ror#7
- add r9,r9,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#11*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#4*4]
-
- add r3,r3,r0
- eor r0,r5,r5,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r5,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r8,r8,r2 @ h+=X[i]
- str r2,[sp,#11*4]
- eor r2,r6,r7
- add r8,r8,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r5
- add r8,r8,r3 @ h+=K256[i]
- eor r2,r2,r7 @ Ch(e,f,g)
- eor r0,r9,r9,ror#11
- add r8,r8,r2 @ h+=Ch(e,f,g)
-#if 27==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 27<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r9,r10 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#13*4] @ from future BODY_16_xx
- eor r3,r9,r10 @ a^b, b^c in next round
- ldr r1,[sp,#10*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r9,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r4,r4,r8 @ d+=h
- eor r12,r12,r10 @ Maj(a,b,c)
- add r8,r8,r0,ror#2 @ h+=Sigma0(a)
- @ add r8,r8,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#13*4] @ 28
- @ ldr r1,[sp,#10*4]
- mov r0,r2,ror#7
- add r8,r8,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#12*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#5*4]
-
- add r12,r12,r0
- eor r0,r4,r4,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r4,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r7,r7,r2 @ h+=X[i]
- str r2,[sp,#12*4]
- eor r2,r5,r6
- add r7,r7,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r4
- add r7,r7,r12 @ h+=K256[i]
- eor r2,r2,r6 @ Ch(e,f,g)
- eor r0,r8,r8,ror#11
- add r7,r7,r2 @ h+=Ch(e,f,g)
-#if 28==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 28<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r8,r9 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#14*4] @ from future BODY_16_xx
- eor r12,r8,r9 @ a^b, b^c in next round
- ldr r1,[sp,#11*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r8,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r11,r11,r7 @ d+=h
- eor r3,r3,r9 @ Maj(a,b,c)
- add r7,r7,r0,ror#2 @ h+=Sigma0(a)
- @ add r7,r7,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#14*4] @ 29
- @ ldr r1,[sp,#11*4]
- mov r0,r2,ror#7
- add r7,r7,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#13*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#6*4]
-
- add r3,r3,r0
- eor r0,r11,r11,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r11,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r6,r6,r2 @ h+=X[i]
- str r2,[sp,#13*4]
- eor r2,r4,r5
- add r6,r6,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r11
- add r6,r6,r3 @ h+=K256[i]
- eor r2,r2,r5 @ Ch(e,f,g)
- eor r0,r7,r7,ror#11
- add r6,r6,r2 @ h+=Ch(e,f,g)
-#if 29==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 29<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r7,r8 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#15*4] @ from future BODY_16_xx
- eor r3,r7,r8 @ a^b, b^c in next round
- ldr r1,[sp,#12*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r7,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r10,r10,r6 @ d+=h
- eor r12,r12,r8 @ Maj(a,b,c)
- add r6,r6,r0,ror#2 @ h+=Sigma0(a)
- @ add r6,r6,r12 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#15*4] @ 30
- @ ldr r1,[sp,#12*4]
- mov r0,r2,ror#7
- add r6,r6,r12 @ h+=Maj(a,b,c) from the past
- mov r12,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r12,r12,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#14*4]
- eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#7*4]
-
- add r12,r12,r0
- eor r0,r10,r10,ror#5 @ from BODY_00_15
- add r2,r2,r12
- eor r0,r0,r10,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r12,[r14],#4 @ *K256++
- add r5,r5,r2 @ h+=X[i]
- str r2,[sp,#14*4]
- eor r2,r11,r4
- add r5,r5,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r10
- add r5,r5,r12 @ h+=K256[i]
- eor r2,r2,r4 @ Ch(e,f,g)
- eor r0,r6,r6,ror#11
- add r5,r5,r2 @ h+=Ch(e,f,g)
-#if 30==31
- and r12,r12,#0xff
- cmp r12,#0xf2 @ done?
-#endif
-#if 30<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r12,r6,r7 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#0*4] @ from future BODY_16_xx
- eor r12,r6,r7 @ a^b, b^c in next round
- ldr r1,[sp,#13*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r6,ror#20 @ Sigma0(a)
- and r3,r3,r12 @ (b^c)&=(a^b)
- add r9,r9,r5 @ d+=h
- eor r3,r3,r7 @ Maj(a,b,c)
- add r5,r5,r0,ror#2 @ h+=Sigma0(a)
- @ add r5,r5,r3 @ h+=Maj(a,b,c)
- @ ldr r2,[sp,#0*4] @ 31
- @ ldr r1,[sp,#13*4]
- mov r0,r2,ror#7
- add r5,r5,r3 @ h+=Maj(a,b,c) from the past
- mov r3,r1,ror#17
- eor r0,r0,r2,ror#18
- eor r3,r3,r1,ror#19
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- ldr r2,[sp,#15*4]
- eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
- ldr r1,[sp,#8*4]
-
- add r3,r3,r0
- eor r0,r9,r9,ror#5 @ from BODY_00_15
- add r2,r2,r3
- eor r0,r0,r9,ror#19 @ Sigma1(e)
- add r2,r2,r1 @ X[i]
- ldr r3,[r14],#4 @ *K256++
- add r4,r4,r2 @ h+=X[i]
- str r2,[sp,#15*4]
- eor r2,r10,r11
- add r4,r4,r0,ror#6 @ h+=Sigma1(e)
- and r2,r2,r9
- add r4,r4,r3 @ h+=K256[i]
- eor r2,r2,r11 @ Ch(e,f,g)
- eor r0,r5,r5,ror#11
- add r4,r4,r2 @ h+=Ch(e,f,g)
-#if 31==31
- and r3,r3,#0xff
- cmp r3,#0xf2 @ done?
-#endif
-#if 31<15
-# if __ARM_ARCH__>=7
- ldr r2,[r1],#4 @ prefetch
-# else
- ldrb r2,[r1,#3]
-# endif
- eor r3,r5,r6 @ a^b, b^c in next round
-#else
- ldr r2,[sp,#1*4] @ from future BODY_16_xx
- eor r3,r5,r6 @ a^b, b^c in next round
- ldr r1,[sp,#14*4] @ from future BODY_16_xx
-#endif
- eor r0,r0,r5,ror#20 @ Sigma0(a)
- and r12,r12,r3 @ (b^c)&=(a^b)
- add r8,r8,r4 @ d+=h
- eor r12,r12,r6 @ Maj(a,b,c)
- add r4,r4,r0,ror#2 @ h+=Sigma0(a)
- @ add r4,r4,r12 @ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq r3,[sp,#16*4] @ pull ctx
- bne Lrounds_16_xx
-
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldr r0,[r3,#0]
- ldr r2,[r3,#4]
- ldr r12,[r3,#8]
- add r4,r4,r0
- ldr r0,[r3,#12]
- add r5,r5,r2
- ldr r2,[r3,#16]
- add r6,r6,r12
- ldr r12,[r3,#20]
- add r7,r7,r0
- ldr r0,[r3,#24]
- add r8,r8,r2
- ldr r2,[r3,#28]
- add r9,r9,r12
- ldr r1,[sp,#17*4] @ pull inp
- ldr r12,[sp,#18*4] @ pull inp+len
- add r10,r10,r0
- add r11,r11,r2
- stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
- cmp r1,r12
- sub r14,r14,#256 @ rewind Ktbl
- bne Loop
-
- add sp,sp,#19*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-#else
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
-.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl _sha256_block_data_order_neon
-.private_extern _sha256_block_data_order_neon
-#ifdef __thumb2__
-.thumb_func _sha256_block_data_order_neon
-#endif
-.align 5
-.skip 16
-_sha256_block_data_order_neon:
-LNEON:
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-
- sub r11,sp,#16*4+16
- adr r14,K256
- bic r11,r11,#15 @ align for 128-bit stores
- mov r12,sp
- mov sp,r11 @ alloca
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
-
- vld1.8 {q0},[r1]!
- vld1.8 {q1},[r1]!
- vld1.8 {q2},[r1]!
- vld1.8 {q3},[r1]!
- vld1.32 {q8},[r14,:128]!
- vld1.32 {q9},[r14,:128]!
- vld1.32 {q10},[r14,:128]!
- vld1.32 {q11},[r14,:128]!
- vrev32.8 q0,q0 @ yes, even on
- str r0,[sp,#64]
- vrev32.8 q1,q1 @ big-endian
- str r1,[sp,#68]
- mov r1,sp
- vrev32.8 q2,q2
- str r2,[sp,#72]
- vrev32.8 q3,q3
- str r12,[sp,#76] @ save original sp
- vadd.i32 q8,q8,q0
- vadd.i32 q9,q9,q1
- vst1.32 {q8},[r1,:128]!
- vadd.i32 q10,q10,q2
- vst1.32 {q9},[r1,:128]!
- vadd.i32 q11,q11,q3
- vst1.32 {q10},[r1,:128]!
- vst1.32 {q11},[r1,:128]!
-
- ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
- sub r1,r1,#64
- ldr r2,[sp,#0]
- eor r12,r12,r12
- eor r3,r5,r6
- b L_00_48
-
-.align 4
-L_00_48:
- vext.8 q8,q0,q1,#4
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- vext.8 q9,q2,q3,#4
- add r4,r4,r12
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vadd.i32 q0,q0,q9
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- vshr.u32 q9,q8,#3
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#4]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- veor q9,q9,q10
- add r10,r10,r2
- vsli.32 q11,q8,#14
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- vshr.u32 d24,d7,#17
- add r11,r11,r3
- and r2,r2,r7
- veor q9,q9,q11
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- vsli.32 d24,d7,#15
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- vshr.u32 d25,d7,#10
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- vadd.i32 q0,q0,q9
- add r10,r10,r2
- ldr r2,[sp,#8]
- veor d25,d25,d24
- and r12,r12,r3
- add r6,r6,r10
- vshr.u32 d24,d7,#19
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- vsli.32 d24,d7,#13
- add r9,r9,r2
- eor r2,r7,r8
- veor d25,d25,d24
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- vadd.i32 d0,d0,d25
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- vshr.u32 d24,d0,#17
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- vsli.32 d24,d0,#15
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- vshr.u32 d25,d0,#10
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- veor d25,d25,d24
- ldr r2,[sp,#12]
- and r3,r3,r12
- vshr.u32 d24,d0,#19
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- vld1.32 {q8},[r14,:128]!
- add r8,r8,r2
- vsli.32 d24,d0,#13
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- veor d25,d25,d24
- add r9,r9,r3
- and r2,r2,r5
- vadd.i32 d1,d1,d25
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- vadd.i32 q8,q8,q0
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#16]
- and r12,r12,r3
- add r4,r4,r8
- vst1.32 {q8},[r1,:128]!
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vext.8 q8,q1,q2,#4
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- vext.8 q9,q3,q0,#4
- add r8,r8,r12
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vadd.i32 q1,q1,q9
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- vshr.u32 q9,q8,#3
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#20]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- veor q9,q9,q10
- add r6,r6,r2
- vsli.32 q11,q8,#14
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- vshr.u32 d24,d1,#17
- add r7,r7,r3
- and r2,r2,r11
- veor q9,q9,q11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- vsli.32 d24,d1,#15
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- vshr.u32 d25,d1,#10
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- vadd.i32 q1,q1,q9
- add r6,r6,r2
- ldr r2,[sp,#24]
- veor d25,d25,d24
- and r12,r12,r3
- add r10,r10,r6
- vshr.u32 d24,d1,#19
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- vsli.32 d24,d1,#13
- add r5,r5,r2
- eor r2,r11,r4
- veor d25,d25,d24
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- vadd.i32 d2,d2,d25
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- vshr.u32 d24,d2,#17
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- vsli.32 d24,d2,#15
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- vshr.u32 d25,d2,#10
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- veor d25,d25,d24
- ldr r2,[sp,#28]
- and r3,r3,r12
- vshr.u32 d24,d2,#19
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- vld1.32 {q8},[r14,:128]!
- add r4,r4,r2
- vsli.32 d24,d2,#13
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- veor d25,d25,d24
- add r5,r5,r3
- and r2,r2,r9
- vadd.i32 d3,d3,d25
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- vadd.i32 q8,q8,q1
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#32]
- and r12,r12,r3
- add r8,r8,r4
- vst1.32 {q8},[r1,:128]!
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vext.8 q8,q2,q3,#4
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- vext.8 q9,q0,q1,#4
- add r4,r4,r12
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vadd.i32 q2,q2,q9
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- vshr.u32 q9,q8,#3
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#36]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- veor q9,q9,q10
- add r10,r10,r2
- vsli.32 q11,q8,#14
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- vshr.u32 d24,d3,#17
- add r11,r11,r3
- and r2,r2,r7
- veor q9,q9,q11
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- vsli.32 d24,d3,#15
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- vshr.u32 d25,d3,#10
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- vadd.i32 q2,q2,q9
- add r10,r10,r2
- ldr r2,[sp,#40]
- veor d25,d25,d24
- and r12,r12,r3
- add r6,r6,r10
- vshr.u32 d24,d3,#19
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- vsli.32 d24,d3,#13
- add r9,r9,r2
- eor r2,r7,r8
- veor d25,d25,d24
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- vadd.i32 d4,d4,d25
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- vshr.u32 d24,d4,#17
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- vsli.32 d24,d4,#15
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- vshr.u32 d25,d4,#10
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- veor d25,d25,d24
- ldr r2,[sp,#44]
- and r3,r3,r12
- vshr.u32 d24,d4,#19
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- vld1.32 {q8},[r14,:128]!
- add r8,r8,r2
- vsli.32 d24,d4,#13
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- veor d25,d25,d24
- add r9,r9,r3
- and r2,r2,r5
- vadd.i32 d5,d5,d25
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- vadd.i32 q8,q8,q2
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#48]
- and r12,r12,r3
- add r4,r4,r8
- vst1.32 {q8},[r1,:128]!
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vext.8 q8,q3,q0,#4
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- vext.8 q9,q1,q2,#4
- add r8,r8,r12
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- vshr.u32 q10,q8,#7
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vadd.i32 q3,q3,q9
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- vshr.u32 q9,q8,#3
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vsli.32 q10,q8,#25
- ldr r2,[sp,#52]
- and r3,r3,r12
- vshr.u32 q11,q8,#18
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- veor q9,q9,q10
- add r6,r6,r2
- vsli.32 q11,q8,#14
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- vshr.u32 d24,d5,#17
- add r7,r7,r3
- and r2,r2,r11
- veor q9,q9,q11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- vsli.32 d24,d5,#15
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- vshr.u32 d25,d5,#10
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- vadd.i32 q3,q3,q9
- add r6,r6,r2
- ldr r2,[sp,#56]
- veor d25,d25,d24
- and r12,r12,r3
- add r10,r10,r6
- vshr.u32 d24,d5,#19
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- vsli.32 d24,d5,#13
- add r5,r5,r2
- eor r2,r11,r4
- veor d25,d25,d24
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- vadd.i32 d6,d6,d25
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- vshr.u32 d24,d6,#17
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- vsli.32 d24,d6,#15
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- vshr.u32 d25,d6,#10
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- veor d25,d25,d24
- ldr r2,[sp,#60]
- and r3,r3,r12
- vshr.u32 d24,d6,#19
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- vld1.32 {q8},[r14,:128]!
- add r4,r4,r2
- vsli.32 d24,d6,#13
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- veor d25,d25,d24
- add r5,r5,r3
- and r2,r2,r9
- vadd.i32 d7,d7,d25
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- vadd.i32 q8,q8,q3
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[r14]
- and r12,r12,r3
- add r8,r8,r4
- vst1.32 {q8},[r1,:128]!
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- teq r2,#0 @ check for K256 terminator
- ldr r2,[sp,#0]
- sub r1,r1,#64
- bne L_00_48
-
- ldr r1,[sp,#68]
- ldr r0,[sp,#72]
- sub r14,r14,#256 @ rewind r14
- teq r1,r0
- it eq
- subeq r1,r1,#64 @ avoid SEGV
- vld1.8 {q0},[r1]! @ load next input block
- vld1.8 {q1},[r1]!
- vld1.8 {q2},[r1]!
- vld1.8 {q3},[r1]!
- it ne
- strne r1,[sp,#68]
- mov r1,sp
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- add r4,r4,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vrev32.8 q0,q0
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vadd.i32 q8,q8,q0
- ldr r2,[sp,#4]
- and r3,r3,r12
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- add r10,r10,r2
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- add r11,r11,r3
- and r2,r2,r7
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- add r10,r10,r2
- ldr r2,[sp,#8]
- and r12,r12,r3
- add r6,r6,r10
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- add r9,r9,r2
- eor r2,r7,r8
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- ldr r2,[sp,#12]
- and r3,r3,r12
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- add r8,r8,r2
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- add r9,r9,r3
- and r2,r2,r5
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#16]
- and r12,r12,r3
- add r4,r4,r8
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vst1.32 {q8},[r1,:128]!
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- add r8,r8,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vrev32.8 q1,q1
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vadd.i32 q8,q8,q1
- ldr r2,[sp,#20]
- and r3,r3,r12
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- add r6,r6,r2
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- add r7,r7,r3
- and r2,r2,r11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- add r6,r6,r2
- ldr r2,[sp,#24]
- and r12,r12,r3
- add r10,r10,r6
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- add r5,r5,r2
- eor r2,r11,r4
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- ldr r2,[sp,#28]
- and r3,r3,r12
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- add r4,r4,r2
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- add r5,r5,r3
- and r2,r2,r9
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#32]
- and r12,r12,r3
- add r8,r8,r4
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vst1.32 {q8},[r1,:128]!
- add r11,r11,r2
- eor r2,r9,r10
- eor r0,r8,r8,ror#5
- add r4,r4,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r8
- eor r12,r0,r8,ror#19
- eor r0,r4,r4,ror#11
- eor r2,r2,r10
- vrev32.8 q2,q2
- add r11,r11,r12,ror#6
- eor r12,r4,r5
- eor r0,r0,r4,ror#20
- add r11,r11,r2
- vadd.i32 q8,q8,q2
- ldr r2,[sp,#36]
- and r3,r3,r12
- add r7,r7,r11
- add r11,r11,r0,ror#2
- eor r3,r3,r5
- add r10,r10,r2
- eor r2,r8,r9
- eor r0,r7,r7,ror#5
- add r11,r11,r3
- and r2,r2,r7
- eor r3,r0,r7,ror#19
- eor r0,r11,r11,ror#11
- eor r2,r2,r9
- add r10,r10,r3,ror#6
- eor r3,r11,r4
- eor r0,r0,r11,ror#20
- add r10,r10,r2
- ldr r2,[sp,#40]
- and r12,r12,r3
- add r6,r6,r10
- add r10,r10,r0,ror#2
- eor r12,r12,r4
- add r9,r9,r2
- eor r2,r7,r8
- eor r0,r6,r6,ror#5
- add r10,r10,r12
- and r2,r2,r6
- eor r12,r0,r6,ror#19
- eor r0,r10,r10,ror#11
- eor r2,r2,r8
- add r9,r9,r12,ror#6
- eor r12,r10,r11
- eor r0,r0,r10,ror#20
- add r9,r9,r2
- ldr r2,[sp,#44]
- and r3,r3,r12
- add r5,r5,r9
- add r9,r9,r0,ror#2
- eor r3,r3,r11
- add r8,r8,r2
- eor r2,r6,r7
- eor r0,r5,r5,ror#5
- add r9,r9,r3
- and r2,r2,r5
- eor r3,r0,r5,ror#19
- eor r0,r9,r9,ror#11
- eor r2,r2,r7
- add r8,r8,r3,ror#6
- eor r3,r9,r10
- eor r0,r0,r9,ror#20
- add r8,r8,r2
- ldr r2,[sp,#48]
- and r12,r12,r3
- add r4,r4,r8
- add r8,r8,r0,ror#2
- eor r12,r12,r10
- vst1.32 {q8},[r1,:128]!
- add r7,r7,r2
- eor r2,r5,r6
- eor r0,r4,r4,ror#5
- add r8,r8,r12
- vld1.32 {q8},[r14,:128]!
- and r2,r2,r4
- eor r12,r0,r4,ror#19
- eor r0,r8,r8,ror#11
- eor r2,r2,r6
- vrev32.8 q3,q3
- add r7,r7,r12,ror#6
- eor r12,r8,r9
- eor r0,r0,r8,ror#20
- add r7,r7,r2
- vadd.i32 q8,q8,q3
- ldr r2,[sp,#52]
- and r3,r3,r12
- add r11,r11,r7
- add r7,r7,r0,ror#2
- eor r3,r3,r9
- add r6,r6,r2
- eor r2,r4,r5
- eor r0,r11,r11,ror#5
- add r7,r7,r3
- and r2,r2,r11
- eor r3,r0,r11,ror#19
- eor r0,r7,r7,ror#11
- eor r2,r2,r5
- add r6,r6,r3,ror#6
- eor r3,r7,r8
- eor r0,r0,r7,ror#20
- add r6,r6,r2
- ldr r2,[sp,#56]
- and r12,r12,r3
- add r10,r10,r6
- add r6,r6,r0,ror#2
- eor r12,r12,r8
- add r5,r5,r2
- eor r2,r11,r4
- eor r0,r10,r10,ror#5
- add r6,r6,r12
- and r2,r2,r10
- eor r12,r0,r10,ror#19
- eor r0,r6,r6,ror#11
- eor r2,r2,r4
- add r5,r5,r12,ror#6
- eor r12,r6,r7
- eor r0,r0,r6,ror#20
- add r5,r5,r2
- ldr r2,[sp,#60]
- and r3,r3,r12
- add r9,r9,r5
- add r5,r5,r0,ror#2
- eor r3,r3,r7
- add r4,r4,r2
- eor r2,r10,r11
- eor r0,r9,r9,ror#5
- add r5,r5,r3
- and r2,r2,r9
- eor r3,r0,r9,ror#19
- eor r0,r5,r5,ror#11
- eor r2,r2,r11
- add r4,r4,r3,ror#6
- eor r3,r5,r6
- eor r0,r0,r5,ror#20
- add r4,r4,r2
- ldr r2,[sp,#64]
- and r12,r12,r3
- add r8,r8,r4
- add r4,r4,r0,ror#2
- eor r12,r12,r6
- vst1.32 {q8},[r1,:128]!
- ldr r0,[r2,#0]
- add r4,r4,r12 @ h+=Maj(a,b,c) from the past
- ldr r12,[r2,#4]
- ldr r3,[r2,#8]
- ldr r1,[r2,#12]
- add r4,r4,r0 @ accumulate
- ldr r0,[r2,#16]
- add r5,r5,r12
- ldr r12,[r2,#20]
- add r6,r6,r3
- ldr r3,[r2,#24]
- add r7,r7,r1
- ldr r1,[r2,#28]
- add r8,r8,r0
- str r4,[r2],#4
- add r9,r9,r12
- str r5,[r2],#4
- add r10,r10,r3
- str r6,[r2],#4
- add r11,r11,r1
- str r7,[r2],#4
- stmia r2,{r8,r9,r10,r11}
-
- ittte ne
- movne r1,sp
- ldrne r2,[sp,#0]
- eorne r12,r12,r12
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne r3,r5,r6
- bne L_00_48
-
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-#ifdef __thumb2__
-.thumb_func sha256_block_data_order_armv8
-#endif
-.align 5
-sha256_block_data_order_armv8:
-LARMv8:
- vld1.32 {q0,q1},[r0]
- sub r3,r3,#256+32
- add r2,r1,r2,lsl#6 @ len to point at the end of inp
- b Loop_v8
-
-.align 4
-Loop_v8:
- vld1.8 {q8,q9},[r1]!
- vld1.8 {q10,q11},[r1]!
- vld1.32 {q12},[r3]!
- vrev32.8 q8,q8
- vrev32.8 q9,q9
- vrev32.8 q10,q10
- vrev32.8 q11,q11
- vmov q14,q0 @ offload
- vmov q15,q1
- teq r1,r2
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q10
- INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
- INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q11
- INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
- INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
- vld1.32 {q13},[r3]!
- vadd.i32 q12,q12,q8
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
-
- vld1.32 {q12},[r3]!
- vadd.i32 q13,q13,q9
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
-
- vld1.32 {q13},[r3]
- vadd.i32 q12,q12,q10
- sub r3,r3,#256-16 @ rewind
- vmov q2,q0
- INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
- INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
-
- vadd.i32 q13,q13,q11
- vmov q2,q0
- INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
- INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
-
- vadd.i32 q0,q0,q14
- vadd.i32 q1,q1,q15
- it ne
- bne Loop_v8
-
- vst1.32 {q0,q1},[r0]
-
- bx lr @ bx lr
-
-#endif
-.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm _OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol _OPENSSL_armcap_P
-.long 0
-.private_extern _OPENSSL_armcap_P
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S b/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S
deleted file mode 100644
index 12884b5..0000000
--- a/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S
+++ /dev/null
@@ -1,1891 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License"). You may not use
-@ this file except in compliance with the License. You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA512 block procedure for ARMv4. September 2007.
-
-@ This code is ~4.5 (four and a half) times faster than code generated
-@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-@ Xscale PXA250 core].
-@
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
-@ Cortex A8 core and ~40 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 7%
-@ improvement on Cortex A8 core and ~38 cycles per byte.
-
-@ March 2011.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process
-@ one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-@ August 2012.
-@
-@ Improve NEON performance by 12% on Snapdragon S4. In absolute
-@ terms it's 22.6 cycles per byte, which is a disappointing result.
-@ Technical writers asserted that the 3-way S4 pipeline can sustain
-@ multiple NEON instructions per cycle, but dual NEON issue could
-@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
-@ for further details. On a side note, Cortex-A15 processes one byte in
-@ 16 cycles.
-
-@ Byte order [in]dependence. =========================================
-@
-@ Originally the caller was expected to maintain a specific *dword* order in
-@ h[0-7], namely with the most significant dword at the *lower* address,
-@ which was reflected in the two parameters below as 0 and 4. Now the caller
-@ is expected to maintain native byte order for whole 64-bit values.
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
-#endif
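
(Editor's note for readers following the integer path below: the LO/HI offsets and the WORD64 macro simply encode how each 64-bit state word is split into two 32-bit halves in memory. A minimal C sketch of the same convention, illustrative only and not part of the generated file, mirroring the __ARMEL__ test above:

    #include <stdint.h>
    #include <string.h>

    #ifdef __ARMEL__            /* little-endian: low half at the lower address */
    # define LO 0
    # define HI 4
    #else                       /* big-endian: high half at the lower address */
    # define HI 0
    # define LO 4
    #endif

    /* Fetch one 32-bit half of the 64-bit state word h[i] by byte offset,
     * the way the assembly does with e.g. "ldr r7,[r0,#32+LO]". */
    static uint32_t load_half(const uint64_t *h, size_t i, int half_off) {
        uint32_t w;
        memcpy(&w, (const uint8_t *)h + 8 * i + half_off, sizeof(w));
        return w;
    }

End of note.)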
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-# define adrl adr
-#else
-.code 32
-#endif
-
-
-.align 5
-K512:
- WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
- WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
- WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
- WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
- WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
- WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
- WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
- WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
- WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
- WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
- WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
- WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
- WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
- WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
- WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
- WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
- WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
- WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
- WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
- WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
- WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
- WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
- WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
- WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
- WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
- WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
- WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
- WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
- WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
- WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
- WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
- WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
- WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
- WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
- WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
- WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
- WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
- WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
- WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
- WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-LOPENSSL_armcap:
-.word OPENSSL_armcap_P-Lsha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-
-.globl _sha512_block_data_order
-.private_extern _sha512_block_data_order
-#ifdef __thumb2__
-.thumb_func _sha512_block_data_order
-#endif
-_sha512_block_data_order:
-Lsha512_block_data_order:
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r3,pc,#8 @ _sha512_block_data_order
-#else
- adr r3,Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV7_NEON
- bne LNEON
-#endif
- add r2,r1,r2,lsl#7 @ len to point at the end of inp
- stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
- sub r14,r3,#672 @ K512
- sub sp,sp,#9*8
-
- ldr r7,[r0,#32+LO]
- ldr r8,[r0,#32+HI]
- ldr r9, [r0,#48+LO]
- ldr r10, [r0,#48+HI]
- ldr r11, [r0,#56+LO]
- ldr r12, [r0,#56+HI]
-Loop:
- str r9, [sp,#48+0]
- str r10, [sp,#48+4]
- str r11, [sp,#56+0]
- str r12, [sp,#56+4]
- ldr r5,[r0,#0+LO]
- ldr r6,[r0,#0+HI]
- ldr r3,[r0,#8+LO]
- ldr r4,[r0,#8+HI]
- ldr r9, [r0,#16+LO]
- ldr r10, [r0,#16+HI]
- ldr r11, [r0,#24+LO]
- ldr r12, [r0,#24+HI]
- str r3,[sp,#8+0]
- str r4,[sp,#8+4]
- str r9, [sp,#16+0]
- str r10, [sp,#16+4]
- str r11, [sp,#24+0]
- str r12, [sp,#24+4]
- ldr r3,[r0,#40+LO]
- ldr r4,[r0,#40+HI]
- str r3,[sp,#40+0]
- str r4,[sp,#40+4]
-
-L00_15:
-#if __ARM_ARCH__<7
- ldrb r3,[r1,#7]
- ldrb r9, [r1,#6]
- ldrb r10, [r1,#5]
- ldrb r11, [r1,#4]
- ldrb r4,[r1,#3]
- ldrb r12, [r1,#2]
- orr r3,r3,r9,lsl#8
- ldrb r9, [r1,#1]
- orr r3,r3,r10,lsl#16
- ldrb r10, [r1],#8
- orr r3,r3,r11,lsl#24
- orr r4,r4,r12,lsl#8
- orr r4,r4,r9,lsl#16
- orr r4,r4,r10,lsl#24
-#else
- ldr r3,[r1,#4]
- ldr r4,[r1],#8
-#ifdef __ARMEL__
- rev r3,r3
- rev r4,r4
-#endif
-#endif
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
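
(Editor's note: the Sigma1 comment above encodes a general identity that also underlies the Sigma0 and sigma0/sigma1 comments further down: a 64-bit rotate right by n can be computed on the two 32-bit halves with a shift pair, and once n passes 32 the halves swap roles, so ROTR 41 becomes a rotate of the swapped halves by 9, which is the hi>>9^lo<<23 term. A small C sketch of that identity; the helper name rotr64_halves is illustrative and does not appear in the generated file:

    #include <stdint.h>

    /* Rotate a 64-bit value right by n (0 < n < 64) using only its 32-bit
     * halves, the decomposition the integer-only code above performs in
     * registers with lsr/lsl/eor. */
    static void rotr64_halves(uint32_t lo, uint32_t hi, unsigned n,
                              uint32_t *out_lo, uint32_t *out_hi) {
        if (n >= 32) {                 /* e.g. ROTR 41: swap halves, rotate by 9 */
            uint32_t t = lo; lo = hi; hi = t;
            n -= 32;
        }
        if (n == 0) {                  /* n was exactly 32: a pure half swap */
            *out_lo = lo; *out_hi = hi;
            return;
        }
        *out_lo = (lo >> n) | (hi << (32 - n));   /* the "lo>>n^hi<<(32-n)" term */
        *out_hi = (hi >> n) | (lo << (32 - n));
    }

XOR-ing the results of this routine for n = 14, 18 and 41 reproduces exactly the LO and HI expressions in the comment above. End of note.)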
- mov r9,r7,lsr#14
- str r3,[sp,#64+0]
- mov r10,r8,lsr#14
- str r4,[sp,#64+4]
- eor r9,r9,r8,lsl#18
- ldr r11,[sp,#56+0] @ h.lo
- eor r10,r10,r7,lsl#18
- ldr r12,[sp,#56+4] @ h.hi
- eor r9,r9,r7,lsr#18
- eor r10,r10,r8,lsr#18
- eor r9,r9,r8,lsl#14
- eor r10,r10,r7,lsl#14
- eor r9,r9,r8,lsr#9
- eor r10,r10,r7,lsr#9
- eor r9,r9,r7,lsl#23
- eor r10,r10,r8,lsl#23 @ Sigma1(e)
- adds r3,r3,r9
- ldr r9,[sp,#40+0] @ f.lo
- adc r4,r4,r10 @ T += Sigma1(e)
- ldr r10,[sp,#40+4] @ f.hi
- adds r3,r3,r11
- ldr r11,[sp,#48+0] @ g.lo
- adc r4,r4,r12 @ T += h
- ldr r12,[sp,#48+4] @ g.hi
-
- eor r9,r9,r11
- str r7,[sp,#32+0]
- eor r10,r10,r12
- str r8,[sp,#32+4]
- and r9,r9,r7
- str r5,[sp,#0+0]
- and r10,r10,r8
- str r6,[sp,#0+4]
- eor r9,r9,r11
- ldr r11,[r14,#LO] @ K[i].lo
- eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#HI] @ K[i].hi
-
- adds r3,r3,r9
- ldr r7,[sp,#24+0] @ d.lo
- adc r4,r4,r10 @ T += Ch(e,f,g)
- ldr r8,[sp,#24+4] @ d.hi
- adds r3,r3,r11
- and r9,r11,#0xff
- adc r4,r4,r12 @ T += K[i]
- adds r7,r7,r3
- ldr r11,[sp,#8+0] @ b.lo
- adc r8,r8,r4 @ d += T
- teq r9,#148
-
- ldr r12,[sp,#16+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq r14,r14,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov r9,r5,lsr#28
- mov r10,r6,lsr#28
- eor r9,r9,r6,lsl#4
- eor r10,r10,r5,lsl#4
- eor r9,r9,r6,lsr#2
- eor r10,r10,r5,lsr#2
- eor r9,r9,r5,lsl#30
- eor r10,r10,r6,lsl#30
- eor r9,r9,r6,lsr#7
- eor r10,r10,r5,lsr#7
- eor r9,r9,r5,lsl#25
- eor r10,r10,r6,lsl#25 @ Sigma0(a)
- adds r3,r3,r9
- and r9,r5,r11
- adc r4,r4,r10 @ T += Sigma0(a)
-
- ldr r10,[sp,#8+4] @ b.hi
- orr r5,r5,r11
- ldr r11,[sp,#16+4] @ c.hi
- and r5,r5,r12
- and r12,r6,r10
- orr r6,r6,r10
- orr r5,r5,r9 @ Maj(a,b,c).lo
- and r6,r6,r11
- adds r5,r5,r3
- orr r6,r6,r12 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc r6,r6,r4 @ h += T
- tst r14,#1
- add r14,r14,#8
- tst r14,#1
- beq L00_15
- ldr r9,[sp,#184+0]
- ldr r10,[sp,#184+4]
- bic r14,r14,#1
-L16_79:
- @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- mov r3,r9,lsr#1
- ldr r11,[sp,#80+0]
- mov r4,r10,lsr#1
- ldr r12,[sp,#80+4]
- eor r3,r3,r10,lsl#31
- eor r4,r4,r9,lsl#31
- eor r3,r3,r9,lsr#8
- eor r4,r4,r10,lsr#8
- eor r3,r3,r10,lsl#24
- eor r4,r4,r9,lsl#24
- eor r3,r3,r9,lsr#7
- eor r4,r4,r10,lsr#7
- eor r3,r3,r10,lsl#25
-
- @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- mov r9,r11,lsr#19
- mov r10,r12,lsr#19
- eor r9,r9,r12,lsl#13
- eor r10,r10,r11,lsl#13
- eor r9,r9,r12,lsr#29
- eor r10,r10,r11,lsr#29
- eor r9,r9,r11,lsl#3
- eor r10,r10,r12,lsl#3
- eor r9,r9,r11,lsr#6
- eor r10,r10,r12,lsr#6
- ldr r11,[sp,#120+0]
- eor r9,r9,r12,lsl#26
-
- ldr r12,[sp,#120+4]
- adds r3,r3,r9
- ldr r9,[sp,#192+0]
- adc r4,r4,r10
-
- ldr r10,[sp,#192+4]
- adds r3,r3,r11
- adc r4,r4,r12
- adds r3,r3,r9
- adc r4,r4,r10
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov r9,r7,lsr#14
- str r3,[sp,#64+0]
- mov r10,r8,lsr#14
- str r4,[sp,#64+4]
- eor r9,r9,r8,lsl#18
- ldr r11,[sp,#56+0] @ h.lo
- eor r10,r10,r7,lsl#18
- ldr r12,[sp,#56+4] @ h.hi
- eor r9,r9,r7,lsr#18
- eor r10,r10,r8,lsr#18
- eor r9,r9,r8,lsl#14
- eor r10,r10,r7,lsl#14
- eor r9,r9,r8,lsr#9
- eor r10,r10,r7,lsr#9
- eor r9,r9,r7,lsl#23
- eor r10,r10,r8,lsl#23 @ Sigma1(e)
- adds r3,r3,r9
- ldr r9,[sp,#40+0] @ f.lo
- adc r4,r4,r10 @ T += Sigma1(e)
- ldr r10,[sp,#40+4] @ f.hi
- adds r3,r3,r11
- ldr r11,[sp,#48+0] @ g.lo
- adc r4,r4,r12 @ T += h
- ldr r12,[sp,#48+4] @ g.hi
-
- eor r9,r9,r11
- str r7,[sp,#32+0]
- eor r10,r10,r12
- str r8,[sp,#32+4]
- and r9,r9,r7
- str r5,[sp,#0+0]
- and r10,r10,r8
- str r6,[sp,#0+4]
- eor r9,r9,r11
- ldr r11,[r14,#LO] @ K[i].lo
- eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#HI] @ K[i].hi
-
- adds r3,r3,r9
- ldr r7,[sp,#24+0] @ d.lo
- adc r4,r4,r10 @ T += Ch(e,f,g)
- ldr r8,[sp,#24+4] @ d.hi
- adds r3,r3,r11
- and r9,r11,#0xff
- adc r4,r4,r12 @ T += K[i]
- adds r7,r7,r3
- ldr r11,[sp,#8+0] @ b.lo
- adc r8,r8,r4 @ d += T
- teq r9,#23
-
- ldr r12,[sp,#16+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq r14,r14,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov r9,r5,lsr#28
- mov r10,r6,lsr#28
- eor r9,r9,r6,lsl#4
- eor r10,r10,r5,lsl#4
- eor r9,r9,r6,lsr#2
- eor r10,r10,r5,lsr#2
- eor r9,r9,r5,lsl#30
- eor r10,r10,r6,lsl#30
- eor r9,r9,r6,lsr#7
- eor r10,r10,r5,lsr#7
- eor r9,r9,r5,lsl#25
- eor r10,r10,r6,lsl#25 @ Sigma0(a)
- adds r3,r3,r9
- and r9,r5,r11
- adc r4,r4,r10 @ T += Sigma0(a)
-
- ldr r10,[sp,#8+4] @ b.hi
- orr r5,r5,r11
- ldr r11,[sp,#16+4] @ c.hi
- and r5,r5,r12
- and r12,r6,r10
- orr r6,r6,r10
- orr r5,r5,r9 @ Maj(a,b,c).lo
- and r6,r6,r11
- adds r5,r5,r3
- orr r6,r6,r12 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc r6,r6,r4 @ h += T
- tst r14,#1
- add r14,r14,#8
-#if __ARM_ARCH__>=7
- ittt eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq r9,[sp,#184+0]
- ldreq r10,[sp,#184+4]
- beq L16_79
- bic r14,r14,#1
-
- ldr r3,[sp,#8+0]
- ldr r4,[sp,#8+4]
- ldr r9, [r0,#0+LO]
- ldr r10, [r0,#0+HI]
- ldr r11, [r0,#8+LO]
- ldr r12, [r0,#8+HI]
- adds r9,r5,r9
- str r9, [r0,#0+LO]
- adc r10,r6,r10
- str r10, [r0,#0+HI]
- adds r11,r3,r11
- str r11, [r0,#8+LO]
- adc r12,r4,r12
- str r12, [r0,#8+HI]
-
- ldr r5,[sp,#16+0]
- ldr r6,[sp,#16+4]
- ldr r3,[sp,#24+0]
- ldr r4,[sp,#24+4]
- ldr r9, [r0,#16+LO]
- ldr r10, [r0,#16+HI]
- ldr r11, [r0,#24+LO]
- ldr r12, [r0,#24+HI]
- adds r9,r5,r9
- str r9, [r0,#16+LO]
- adc r10,r6,r10
- str r10, [r0,#16+HI]
- adds r11,r3,r11
- str r11, [r0,#24+LO]
- adc r12,r4,r12
- str r12, [r0,#24+HI]
-
- ldr r3,[sp,#40+0]
- ldr r4,[sp,#40+4]
- ldr r9, [r0,#32+LO]
- ldr r10, [r0,#32+HI]
- ldr r11, [r0,#40+LO]
- ldr r12, [r0,#40+HI]
- adds r7,r7,r9
- str r7,[r0,#32+LO]
- adc r8,r8,r10
- str r8,[r0,#32+HI]
- adds r11,r3,r11
- str r11, [r0,#40+LO]
- adc r12,r4,r12
- str r12, [r0,#40+HI]
-
- ldr r5,[sp,#48+0]
- ldr r6,[sp,#48+4]
- ldr r3,[sp,#56+0]
- ldr r4,[sp,#56+4]
- ldr r9, [r0,#48+LO]
- ldr r10, [r0,#48+HI]
- ldr r11, [r0,#56+LO]
- ldr r12, [r0,#56+HI]
- adds r9,r5,r9
- str r9, [r0,#48+LO]
- adc r10,r6,r10
- str r10, [r0,#48+HI]
- adds r11,r3,r11
- str r11, [r0,#56+LO]
- adc r12,r4,r12
- str r12, [r0,#56+HI]
-
- add sp,sp,#640
- sub r14,r14,#640
-
- teq r1,r2
- bne Loop
-
- add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-#else
- ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
-.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl _sha512_block_data_order_neon
-.private_extern _sha512_block_data_order_neon
-#ifdef __thumb2__
-.thumb_func _sha512_block_data_order_neon
-#endif
-.align 4
-_sha512_block_data_order_neon:
-LNEON:
- dmb @ errata #451034 on early Cortex A8
- add r2,r1,r2,lsl#7 @ len to point at the end of inp
- adr r3,K512
- VFP_ABI_PUSH
- vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
-Loop_neon:
- vshr.u64 d24,d20,#14 @ 0
-#if 0<16
- vld1.64 {d0},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d20,#18
-#if 0>0
- vadd.i64 d16,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d20,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 0<16 && defined(__ARMEL__)
- vrev64.8 d0,d0
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d0
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 1
-#if 1<16
- vld1.64 {d1},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 1>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 1<16 && defined(__ARMEL__)
- vrev64.8 d1,d1
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d1
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 d24,d18,#14 @ 2
-#if 2<16
- vld1.64 {d2},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d18,#18
-#if 2>0
- vadd.i64 d22,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d18,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 2<16 && defined(__ARMEL__)
- vrev64.8 d2,d2
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d2
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 3
-#if 3<16
- vld1.64 {d3},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 3>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 3<16 && defined(__ARMEL__)
- vrev64.8 d3,d3
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d3
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 d24,d16,#14 @ 4
-#if 4<16
- vld1.64 {d4},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d16,#18
-#if 4>0
- vadd.i64 d20,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d16,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 4<16 && defined(__ARMEL__)
- vrev64.8 d4,d4
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d4
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 5
-#if 5<16
- vld1.64 {d5},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 5>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 5<16 && defined(__ARMEL__)
- vrev64.8 d5,d5
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d5
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 d24,d22,#14 @ 6
-#if 6<16
- vld1.64 {d6},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d22,#18
-#if 6>0
- vadd.i64 d18,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d22,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 6<16 && defined(__ARMEL__)
- vrev64.8 d6,d6
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d6
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 7
-#if 7<16
- vld1.64 {d7},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 7>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 7<16 && defined(__ARMEL__)
- vrev64.8 d7,d7
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d7
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- vshr.u64 d24,d20,#14 @ 8
-#if 8<16
- vld1.64 {d8},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d20,#18
-#if 8>0
- vadd.i64 d16,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d20,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 8<16 && defined(__ARMEL__)
- vrev64.8 d8,d8
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d8
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 9
-#if 9<16
- vld1.64 {d9},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 9>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 9<16 && defined(__ARMEL__)
- vrev64.8 d9,d9
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d9
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 d24,d18,#14 @ 10
-#if 10<16
- vld1.64 {d10},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d18,#18
-#if 10>0
- vadd.i64 d22,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d18,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 10<16 && defined(__ARMEL__)
- vrev64.8 d10,d10
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d10
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 11
-#if 11<16
- vld1.64 {d11},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 11>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 11<16 && defined(__ARMEL__)
- vrev64.8 d11,d11
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d11
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 d24,d16,#14 @ 12
-#if 12<16
- vld1.64 {d12},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d16,#18
-#if 12>0
- vadd.i64 d20,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d16,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 12<16 && defined(__ARMEL__)
- vrev64.8 d12,d12
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d12
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 13
-#if 13<16
- vld1.64 {d13},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 13>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 13<16 && defined(__ARMEL__)
- vrev64.8 d13,d13
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d13
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 d24,d22,#14 @ 14
-#if 14<16
- vld1.64 {d14},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d22,#18
-#if 14>0
- vadd.i64 d18,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d22,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 14<16 && defined(__ARMEL__)
- vrev64.8 d14,d14
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d14
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 15
-#if 15<16
- vld1.64 {d15},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 15>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 15<16 && defined(__ARMEL__)
- vrev64.8 d15,d15
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d15
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- mov r12,#4
-L16_79_neon:
- subs r12,#1
- vshr.u64 q12,q7,#19
- vshr.u64 q13,q7,#61
- vadd.i64 d16,d30 @ h+=Maj from the past
- vshr.u64 q15,q7,#6
- vsli.64 q12,q7,#45
- vext.8 q14,q0,q1,#8 @ X[i+1]
- vsli.64 q13,q7,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q0,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q4,q5,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d20,#14 @ from NEON_00_15
- vadd.i64 q0,q14
- vshr.u64 d25,d20,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d20,#41 @ from NEON_00_15
- vadd.i64 q0,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 16<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d0
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 17
-#if 17<16
- vld1.64 {d1},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 17>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 17<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d1
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 q12,q0,#19
- vshr.u64 q13,q0,#61
- vadd.i64 d22,d30 @ h+=Maj from the past
- vshr.u64 q15,q0,#6
- vsli.64 q12,q0,#45
- vext.8 q14,q1,q2,#8 @ X[i+1]
- vsli.64 q13,q0,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q1,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q5,q6,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d18,#14 @ from NEON_00_15
- vadd.i64 q1,q14
- vshr.u64 d25,d18,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d18,#41 @ from NEON_00_15
- vadd.i64 q1,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 18<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d2
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 19
-#if 19<16
- vld1.64 {d3},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 19>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 19<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d3
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 q12,q1,#19
- vshr.u64 q13,q1,#61
- vadd.i64 d20,d30 @ h+=Maj from the past
- vshr.u64 q15,q1,#6
- vsli.64 q12,q1,#45
- vext.8 q14,q2,q3,#8 @ X[i+1]
- vsli.64 q13,q1,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q2,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q6,q7,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d16,#14 @ from NEON_00_15
- vadd.i64 q2,q14
- vshr.u64 d25,d16,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d16,#41 @ from NEON_00_15
- vadd.i64 q2,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 20<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d4
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 21
-#if 21<16
- vld1.64 {d5},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 21>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 21<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d5
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 q12,q2,#19
- vshr.u64 q13,q2,#61
- vadd.i64 d18,d30 @ h+=Maj from the past
- vshr.u64 q15,q2,#6
- vsli.64 q12,q2,#45
- vext.8 q14,q3,q4,#8 @ X[i+1]
- vsli.64 q13,q2,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q3,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q7,q0,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d22,#14 @ from NEON_00_15
- vadd.i64 q3,q14
- vshr.u64 d25,d22,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d22,#41 @ from NEON_00_15
- vadd.i64 q3,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 22<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d6
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 23
-#if 23<16
- vld1.64 {d7},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 23>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 23<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d7
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- vshr.u64 q12,q3,#19
- vshr.u64 q13,q3,#61
- vadd.i64 d16,d30 @ h+=Maj from the past
- vshr.u64 q15,q3,#6
- vsli.64 q12,q3,#45
- vext.8 q14,q4,q5,#8 @ X[i+1]
- vsli.64 q13,q3,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q4,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q0,q1,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d20,#14 @ from NEON_00_15
- vadd.i64 q4,q14
- vshr.u64 d25,d20,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d20,#41 @ from NEON_00_15
- vadd.i64 q4,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d20,#50
- vsli.64 d25,d20,#46
- vmov d29,d20
- vsli.64 d26,d20,#23
-#if 24<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d21,d22 @ Ch(e,f,g)
- vshr.u64 d24,d16,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d23
- vshr.u64 d25,d16,#34
- vsli.64 d24,d16,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d16,#39
- vadd.i64 d28,d8
- vsli.64 d25,d16,#30
- veor d30,d16,d17
- vsli.64 d26,d16,#25
- veor d23,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d18,d17 @ Maj(a,b,c)
- veor d23,d26 @ Sigma0(a)
- vadd.i64 d19,d27
- vadd.i64 d30,d27
- @ vadd.i64 d23,d30
- vshr.u64 d24,d19,#14 @ 25
-#if 25<16
- vld1.64 {d9},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d19,#18
-#if 25>0
- vadd.i64 d23,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d19,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d19,#50
- vsli.64 d25,d19,#46
- vmov d29,d19
- vsli.64 d26,d19,#23
-#if 25<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d20,d21 @ Ch(e,f,g)
- vshr.u64 d24,d23,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d22
- vshr.u64 d25,d23,#34
- vsli.64 d24,d23,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d23,#39
- vadd.i64 d28,d9
- vsli.64 d25,d23,#30
- veor d30,d23,d16
- vsli.64 d26,d23,#25
- veor d22,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d17,d16 @ Maj(a,b,c)
- veor d22,d26 @ Sigma0(a)
- vadd.i64 d18,d27
- vadd.i64 d30,d27
- @ vadd.i64 d22,d30
- vshr.u64 q12,q4,#19
- vshr.u64 q13,q4,#61
- vadd.i64 d22,d30 @ h+=Maj from the past
- vshr.u64 q15,q4,#6
- vsli.64 q12,q4,#45
- vext.8 q14,q5,q6,#8 @ X[i+1]
- vsli.64 q13,q4,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q5,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q1,q2,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d18,#14 @ from NEON_00_15
- vadd.i64 q5,q14
- vshr.u64 d25,d18,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d18,#41 @ from NEON_00_15
- vadd.i64 q5,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d18,#50
- vsli.64 d25,d18,#46
- vmov d29,d18
- vsli.64 d26,d18,#23
-#if 26<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d19,d20 @ Ch(e,f,g)
- vshr.u64 d24,d22,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d21
- vshr.u64 d25,d22,#34
- vsli.64 d24,d22,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d22,#39
- vadd.i64 d28,d10
- vsli.64 d25,d22,#30
- veor d30,d22,d23
- vsli.64 d26,d22,#25
- veor d21,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d16,d23 @ Maj(a,b,c)
- veor d21,d26 @ Sigma0(a)
- vadd.i64 d17,d27
- vadd.i64 d30,d27
- @ vadd.i64 d21,d30
- vshr.u64 d24,d17,#14 @ 27
-#if 27<16
- vld1.64 {d11},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d17,#18
-#if 27>0
- vadd.i64 d21,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d17,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d17,#50
- vsli.64 d25,d17,#46
- vmov d29,d17
- vsli.64 d26,d17,#23
-#if 27<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d18,d19 @ Ch(e,f,g)
- vshr.u64 d24,d21,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d20
- vshr.u64 d25,d21,#34
- vsli.64 d24,d21,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d21,#39
- vadd.i64 d28,d11
- vsli.64 d25,d21,#30
- veor d30,d21,d22
- vsli.64 d26,d21,#25
- veor d20,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d23,d22 @ Maj(a,b,c)
- veor d20,d26 @ Sigma0(a)
- vadd.i64 d16,d27
- vadd.i64 d30,d27
- @ vadd.i64 d20,d30
- vshr.u64 q12,q5,#19
- vshr.u64 q13,q5,#61
- vadd.i64 d20,d30 @ h+=Maj from the past
- vshr.u64 q15,q5,#6
- vsli.64 q12,q5,#45
- vext.8 q14,q6,q7,#8 @ X[i+1]
- vsli.64 q13,q5,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q6,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q2,q3,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d16,#14 @ from NEON_00_15
- vadd.i64 q6,q14
- vshr.u64 d25,d16,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d16,#41 @ from NEON_00_15
- vadd.i64 q6,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d16,#50
- vsli.64 d25,d16,#46
- vmov d29,d16
- vsli.64 d26,d16,#23
-#if 28<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d17,d18 @ Ch(e,f,g)
- vshr.u64 d24,d20,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d19
- vshr.u64 d25,d20,#34
- vsli.64 d24,d20,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d20,#39
- vadd.i64 d28,d12
- vsli.64 d25,d20,#30
- veor d30,d20,d21
- vsli.64 d26,d20,#25
- veor d19,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d22,d21 @ Maj(a,b,c)
- veor d19,d26 @ Sigma0(a)
- vadd.i64 d23,d27
- vadd.i64 d30,d27
- @ vadd.i64 d19,d30
- vshr.u64 d24,d23,#14 @ 29
-#if 29<16
- vld1.64 {d13},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d23,#18
-#if 29>0
- vadd.i64 d19,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d23,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d23,#50
- vsli.64 d25,d23,#46
- vmov d29,d23
- vsli.64 d26,d23,#23
-#if 29<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d16,d17 @ Ch(e,f,g)
- vshr.u64 d24,d19,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d18
- vshr.u64 d25,d19,#34
- vsli.64 d24,d19,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d19,#39
- vadd.i64 d28,d13
- vsli.64 d25,d19,#30
- veor d30,d19,d20
- vsli.64 d26,d19,#25
- veor d18,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d21,d20 @ Maj(a,b,c)
- veor d18,d26 @ Sigma0(a)
- vadd.i64 d22,d27
- vadd.i64 d30,d27
- @ vadd.i64 d18,d30
- vshr.u64 q12,q6,#19
- vshr.u64 q13,q6,#61
- vadd.i64 d18,d30 @ h+=Maj from the past
- vshr.u64 q15,q6,#6
- vsli.64 q12,q6,#45
- vext.8 q14,q7,q0,#8 @ X[i+1]
- vsli.64 q13,q6,#3
- veor q15,q12
- vshr.u64 q12,q14,#1
- veor q15,q13 @ sigma1(X[i+14])
- vshr.u64 q13,q14,#8
- vadd.i64 q7,q15
- vshr.u64 q15,q14,#7
- vsli.64 q12,q14,#63
- vsli.64 q13,q14,#56
- vext.8 q14,q3,q4,#8 @ X[i+9]
- veor q15,q12
- vshr.u64 d24,d22,#14 @ from NEON_00_15
- vadd.i64 q7,q14
- vshr.u64 d25,d22,#18 @ from NEON_00_15
- veor q15,q13 @ sigma0(X[i+1])
- vshr.u64 d26,d22,#41 @ from NEON_00_15
- vadd.i64 q7,q15
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d22,#50
- vsli.64 d25,d22,#46
- vmov d29,d22
- vsli.64 d26,d22,#23
-#if 30<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d23,d16 @ Ch(e,f,g)
- vshr.u64 d24,d18,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d17
- vshr.u64 d25,d18,#34
- vsli.64 d24,d18,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d18,#39
- vadd.i64 d28,d14
- vsli.64 d25,d18,#30
- veor d30,d18,d19
- vsli.64 d26,d18,#25
- veor d17,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d20,d19 @ Maj(a,b,c)
- veor d17,d26 @ Sigma0(a)
- vadd.i64 d21,d27
- vadd.i64 d30,d27
- @ vadd.i64 d17,d30
- vshr.u64 d24,d21,#14 @ 31
-#if 31<16
- vld1.64 {d15},[r1]! @ handles unaligned
-#endif
- vshr.u64 d25,d21,#18
-#if 31>0
- vadd.i64 d17,d30 @ h+=Maj from the past
-#endif
- vshr.u64 d26,d21,#41
- vld1.64 {d28},[r3,:64]! @ K[i++]
- vsli.64 d24,d21,#50
- vsli.64 d25,d21,#46
- vmov d29,d21
- vsli.64 d26,d21,#23
-#if 31<16 && defined(__ARMEL__)
- vrev64.8 ,
-#endif
- veor d25,d24
- vbsl d29,d22,d23 @ Ch(e,f,g)
- vshr.u64 d24,d17,#28
- veor d26,d25 @ Sigma1(e)
- vadd.i64 d27,d29,d16
- vshr.u64 d25,d17,#34
- vsli.64 d24,d17,#36
- vadd.i64 d27,d26
- vshr.u64 d26,d17,#39
- vadd.i64 d28,d15
- vsli.64 d25,d17,#30
- veor d30,d17,d18
- vsli.64 d26,d17,#25
- veor d16,d24,d25
- vadd.i64 d27,d28
- vbsl d30,d19,d18 @ Maj(a,b,c)
- veor d16,d26 @ Sigma0(a)
- vadd.i64 d20,d27
- vadd.i64 d30,d27
- @ vadd.i64 d16,d30
- bne L16_79_neon
-
- vadd.i64 d16,d30 @ h+=Maj from the past
- vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
- vadd.i64 q8,q12 @ vectorized accumulate
- vadd.i64 q9,q13
- vadd.i64 q10,q14
- vadd.i64 q11,q15
- vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
- teq r1,r2
- sub r3,#640 @ rewind K512
- bne Loop_neon
-
- VFP_ABI_POP
- bx lr @ .word 0xe12fff1e
-
-#endif
-.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm _OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol _OPENSSL_armcap_P
-.long 0
-.private_extern _OPENSSL_armcap_P
-#endif
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S b/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S
deleted file mode 100644
index 4cdc521..0000000
--- a/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S
+++ /dev/null
@@ -1,1257 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-.syntax unified
-
-
-
-
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-.text
-
-
-.align 7 @ totally strategic alignment
-_vpaes_consts:
-Lk_mc_forward:@ mc_forward
-.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad 0x080B0A0904070605, 0x000302010C0F0E0D
-.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad 0x000302010C0F0E0D, 0x080B0A0904070605
-Lk_mc_backward:@ mc_backward
-.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad 0x020100030E0D0C0F, 0x0A09080B06050407
-.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad 0x0A09080B06050407, 0x020100030E0D0C0F
-Lk_sr:@ sr
-.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad 0x030E09040F0A0500, 0x0B06010C07020D08
-.quad 0x0F060D040B020900, 0x070E050C030A0108
-.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
-
-@
-@ "Hot" constants
-@
-Lk_inv:@ inv, inva
-.quad 0x0E05060F0D080180, 0x040703090A0B0C02
-.quad 0x01040A060F0B0780, 0x030D0E0C02050809
-Lk_ipt:@ input transform (lo, hi)
-.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-Lk_sbo:@ sbou, sbot
-.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-Lk_sb1:@ sb1u, sb1t
-.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-Lk_sb2:@ sb2u, sb2t
-.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
-.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
-
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.align 2
-
-.align 6
-@@
-@@ _aes_preheat
-@@
-@@ Fills q9-q15 as specified below.
-@@
-#ifdef __thumb2__
-.thumb_func _vpaes_preheat
-#endif
-.align 4
-_vpaes_preheat:
- adr r10, Lk_inv
- vmov.i8 q9, #0x0f @ Lk_s0F
- vld1.64 {q10,q11}, [r10]! @ Lk_inv
- add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo
- vld1.64 {q12,q13}, [r10]! @ Lk_sb1
- vld1.64 {q14,q15}, [r10] @ Lk_sb2
- bx lr
-
-@@
-@@ _aes_encrypt_core
-@@
-@@ AES-encrypt q0.
-@@
-@@ Inputs:
-@@ q0 = input
-@@ q9-q15 as in _vpaes_preheat
-@@ [r2] = scheduled keys
-@@
-@@ Output in q0
-@@ Clobbers q1-q5, r8-r11
-@@ Preserves q6-q8 so you get some local vectors
-@@
-@@
-#ifdef __thumb2__
-.thumb_func _vpaes_encrypt_core
-#endif
-.align 4
-_vpaes_encrypt_core:
- mov r9, r2
- ldr r8, [r2,#240] @ pull rounds
- adr r11, Lk_ipt
- @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
- @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
- vld1.64 {q2, q3}, [r11]
- adr r11, Lk_mc_forward+16
- vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
- vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
- vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
- vtbl.8 d3, {q2}, d3
- vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
- vtbl.8 d5, {q3}, d1
- veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
- veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
-
- @ .Lenc_entry ends with a bne instruction which is normally paired with
- @ subs in .Lenc_loop.
- tst r8, r8
- b Lenc_entry
-
-.align 4
-Lenc_loop:
- @ middle of middle round
- add r10, r11, #0x40
- vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
- vtbl.8 d9, {q13}, d5
- vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
- vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
- vtbl.8 d1, {q12}, d7
- veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
- vtbl.8 d11, {q15}, d5
- veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
- vtbl.8 d5, {q14}, d7
- vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
- vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
- vtbl.8 d7, {q0}, d3
- veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
- @ Write to q5 instead of q0, so the table and destination registers do
- @ not overlap.
- vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
- vtbl.8 d11, {q0}, d9
- veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
- vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
- vtbl.8 d9, {q3}, d3
- @ Here we restore the original q0/q5 usage.
- veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
- and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
- veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
- subs r8, r8, #1 @ nr--
-
-Lenc_entry:
- @ top of round
- vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
- vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
- vtbl.8 d11, {q11}, d3
- veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- vtbl.8 d7, {q10}, d1
- vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- vtbl.8 d9, {q10}, d3
- veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- vtbl.8 d5, {q10}, d7
- vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- vtbl.8 d7, {q10}, d9
- veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
- bne Lenc_loop
-
- @ middle of last round
- add r10, r11, #0x80
-
- adr r11, Lk_sbo
- @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
- @ overlap table and destination registers.
- vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
- vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16
- vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- vtbl.8 d9, {q1}, d5
- vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
- @ Write to q2 instead of q0 below, to avoid overlapping table and
- @ destination registers.
- vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
- vtbl.8 d5, {q0}, d7
- veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
- veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
- @ Here we restore the original q0/q2 usage.
- vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
- vtbl.8 d1, {q2}, d3
- bx lr
-
-
-.globl _vpaes_encrypt
-.private_extern _vpaes_encrypt
-#ifdef __thumb2__
-.thumb_func _vpaes_encrypt
-#endif
-.align 4
-_vpaes_encrypt:
- @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
- @ alignment.
- stmdb sp!, {r7,r8,r9,r10,r11,lr}
- @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
- vstmdb sp!, {d8,d9,d10,d11}
-
- vld1.64 {q0}, [r0]
- bl _vpaes_preheat
- bl _vpaes_encrypt_core
- vst1.64 {q0}, [r1]
-
- vldmia sp!, {d8,d9,d10,d11}
- ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
-
-
-@
-@ Decryption stuff
-@
-
-.align 4
-_vpaes_decrypt_consts:
-Lk_dipt:@ decryption input transform
-.quad 0x0F505B040B545F00, 0x154A411E114E451A
-.quad 0x86E383E660056500, 0x12771772F491F194
-Lk_dsbo:@ decryption sbox final output
-.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-Lk_dsb9:@ decryption sbox output *9*u, *9*t
-.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-Lk_dsbd:@ decryption sbox output *D*u, *D*t
-.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-Lk_dsbb:@ decryption sbox output *B*u, *B*t
-.quad 0xD022649296B44200, 0x602646F6B0F2D404
-.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-Lk_dsbe:@ decryption sbox output *E*u, *E*t
-.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-
-@@
-@@ Decryption core
-@@
-@@ Same API as encryption core, except it clobbers q12-q15 rather than using
-@@ the values from _vpaes_preheat. q9-q11 must still be set from
-@@ _vpaes_preheat.
-@@
-#ifdef __thumb2__
-.thumb_func _vpaes_decrypt_core
-#endif
-.align 4
-_vpaes_decrypt_core:
- mov r9, r2
- ldr r8, [r2,#240] @ pull rounds
-
- @ This function performs shuffles with various constants. The x86_64
- @ version loads them on-demand into %xmm0-%xmm5. This does not work well
- @ for ARMv7 because those registers are shuffle destinations. The ARMv8
- @ version preloads those constants into registers, but ARMv7 has half
- @ the registers to work with. Instead, we load them on-demand into
- @ q12-q15, registers normally used for preloaded constants. This is fine
- @ because decryption doesn't use those constants. The values are
- @ constant, so this does not interfere with potential 2x optimizations.
- adr r7, Lk_dipt
-
- vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo
- lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
- eor r11, r11, #0x30 @ xor $0x30, %r11
- adr r10, Lk_sr
- and r11, r11, #0x30 @ and $0x30, %r11
- add r11, r11, r10
- adr r10, Lk_mc_forward+48
-
- vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
- vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
- vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
- vtbl.8 d5, {q12}, d3
- vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5
- @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
- vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
- vtbl.8 d1, {q13}, d1
- veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
- veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
-
- @ .Ldec_entry ends with a bne instruction which is normally paired with
- @ subs in .Ldec_loop.
- tst r8, r8
- b Ldec_entry
-
-.align 4
-Ldec_loop:
-@
-@ Inverse mix columns
-@
-
- @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
- @ the function.
- adr r10, Lk_dsb9
- vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
- @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
- @ Load sbd* ahead of time.
- vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
- @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
- vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
- vtbl.8 d9, {q12}, d5
- vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
- vtbl.8 d3, {q13}, d7
- veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
-
- veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
-
- @ Load sbb* ahead of time.
- vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
- @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
-
- vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
- vtbl.8 d9, {q14}, d5
- @ Write to q1 instead of q0, so the table and destination registers do
- @ not overlap.
- vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- vtbl.8 d3, {q0}, d11
- @ Here we restore the original q0/q1 usage. This instruction is
- @ reordered from the ARMv8 version so we do not clobber the vtbl.8
- @ below.
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
- vtbl.8 d3, {q15}, d7
- @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
- veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
-
- @ Load sbe* ahead of time.
- vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
- @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
-
- vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
- vtbl.8 d9, {q12}, d5
- @ Write to q1 instead of q0, so the table and destination registers do
- @ not overlap.
- vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- vtbl.8 d3, {q0}, d11
- @ Here we restore the original q0/q1 usage. This instruction is
- @ reordered from the ARMv8 version so we do not clobber the vtbl.8
- @ below.
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
- vtbl.8 d3, {q13}, d7
- veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
-
- vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
- vtbl.8 d9, {q14}, d5
- @ Write to q1 instead of q0, so the table and destination registers do
- @ not overlap.
- vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
- vtbl.8 d3, {q0}, d11
- @ Here we restore the original q0/q1 usage. This instruction is
- @ reordered from the ARMv8 version so we do not clobber the vtbl.8
- @ below.
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
- vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
- vtbl.8 d3, {q15}, d7
- vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
- veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
- subs r8, r8, #1 @ sub $1,%rax # nr--
-
-Ldec_entry:
- @ top of round
- vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
- vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
- vtbl.8 d5, {q11}, d3
- veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- vtbl.8 d7, {q10}, d1
- vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- vtbl.8 d9, {q10}, d3
- veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
- vtbl.8 d5, {q10}, d7
- vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
- vtbl.8 d7, {q10}, d9
- veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
- veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
- vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
- bne Ldec_loop
-
- @ middle of last round
-
- adr r10, Lk_dsbo
-
- @ Write to q1 rather than q4 to avoid overlapping table and destination.
- vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
- vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
- vtbl.8 d9, {q1}, d5
- @ Write to q2 rather than q1 to avoid overlapping table and destination.
- vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
- vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
- vtbl.8 d3, {q2}, d7
- vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
- veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
- @ Write to q1 rather than q0 so the table and destination registers
- @ below do not overlap.
- veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
- vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
- vtbl.8 d1, {q1}, d5
- bx lr
-
-
-.globl _vpaes_decrypt
-.private_extern _vpaes_decrypt
-#ifdef __thumb2__
-.thumb_func _vpaes_decrypt
-#endif
-.align 4
-_vpaes_decrypt:
- @ _vpaes_decrypt_core uses r7-r11.
- stmdb sp!, {r7,r8,r9,r10,r11,lr}
- @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
- vstmdb sp!, {d8,d9,d10,d11}
-
- vld1.64 {q0}, [r0]
- bl _vpaes_preheat
- bl _vpaes_decrypt_core
- vst1.64 {q0}, [r1]
-
- vldmia sp!, {d8,d9,d10,d11}
- ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
-
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@@ @@
-@@ AES key schedule @@
-@@ @@
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-@ This function diverges from both x86_64 and aarch64 in which constants are
-@ pinned. x86_64 has a common preheat function for all operations. aarch64
-@ separates them because it has enough registers to pin nearly all constants.
-@ armv7 does not have enough registers, but needing explicit loads and stores
-@ also complicates using x86_64's register allocation directly.
-@
-@ We pin some constants for convenience and leave q14 and q15 free to load
-@ others on demand.
-
-@
-@ Key schedule constants
-@
-
-.align 4
-_vpaes_key_consts:
-Lk_dksd:@ decryption key schedule: invskew x*D
-.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-Lk_dksb:@ decryption key schedule: invskew x*B
-.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
-.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-Lk_dks9:@ decryption key schedule: invskew x*9
-.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-Lk_rcon:@ rcon
-.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-Lk_opt:@ output transform
-.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-Lk_deskew:@ deskew tables: inverts the sbox's "skew"
-.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-
-#ifdef __thumb2__
-.thumb_func _vpaes_key_preheat
-#endif
-.align 4
-_vpaes_key_preheat:
- adr r11, Lk_rcon
- vmov.i8 q12, #0x5b @ Lk_s63
- adr r10, Lk_inv @ Must be aligned to 8 mod 16.
- vmov.i8 q9, #0x0f @ Lk_s0F
- vld1.64 {q10,q11}, [r10] @ Lk_inv
- vld1.64 {q8}, [r11] @ Lk_rcon
- bx lr
-
-
-#ifdef __thumb2__
-.thumb_func _vpaes_schedule_core
-#endif
-.align 4
-_vpaes_schedule_core:
- @ We only need to save lr, but ARM requires an 8-byte stack alignment,
- @ so save an extra register.
- stmdb sp!, {r3,lr}
-
- bl _vpaes_key_preheat @ load the tables
-
- adr r11, Lk_ipt @ Must be aligned to 8 mod 16.
- vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
-
- @ input transform
- @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
- @ overlap table and destination.
- vmov q4, q0 @ vmovdqa %xmm0, %xmm3
- bl _vpaes_schedule_transform
- adr r10, Lk_sr @ Must be aligned to 8 mod 16.
- vmov q7, q0 @ vmovdqa %xmm0, %xmm7
-
- add r8, r8, r10
- tst r3, r3
- bne Lschedule_am_decrypting
-
- @ encrypting, output zeroth round key after transform
- vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
- b Lschedule_go
-
-Lschedule_am_decrypting:
- @ decrypting, output zeroth round key after shiftrows
- vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
- vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
- vtbl.8 d7, {q4}, d3
- vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
- eor r8, r8, #0x30 @ xor $0x30, %r8
-
-Lschedule_go:
- cmp r1, #192 @ cmp $192, %esi
- bhi Lschedule_256
- beq Lschedule_192
- @ 128: fall through
-
-@@
-@@ .schedule_128
-@@
-@@ 128-bit specific part of key schedule.
-@@
-@@ This schedule is really simple, because all its parts
-@@ are accomplished by the subroutines.
-@@
-Lschedule_128:
- mov r0, #10 @ mov $10, %esi
-
-Loop_schedule_128:
- bl _vpaes_schedule_round
- subs r0, r0, #1 @ dec %esi
- beq Lschedule_mangle_last
- bl _vpaes_schedule_mangle @ write output
- b Loop_schedule_128
-
-@@
-@@ .aes_schedule_192
-@@
-@@ 192-bit specific part of key schedule.
-@@
-@@ The main body of this schedule is the same as the 128-bit
-@@ schedule, but with more smearing. The long, high side is
-@@ stored in q7 as before, and the short, low side is in
-@@ the high bits of q6.
-@@
-@@ This schedule is somewhat nastier, however, because each
-@@ round produces 192 bits of key material, or 1.5 round keys.
-@@ Therefore, on each cycle we do 2 rounds and produce 3 round
-@@ keys.
-@@
-.align 4
-Lschedule_192:
- sub r0, r0, #8
- vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
- bl _vpaes_schedule_transform @ input transform
- vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
- vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
- @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
- mov r0, #4 @ mov $4, %esi
-
-Loop_schedule_192:
- bl _vpaes_schedule_round
- vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
- bl _vpaes_schedule_mangle @ save key n
- bl _vpaes_schedule_192_smear
- bl _vpaes_schedule_mangle @ save key n+1
- bl _vpaes_schedule_round
- subs r0, r0, #1 @ dec %esi
- beq Lschedule_mangle_last
- bl _vpaes_schedule_mangle @ save key n+2
- bl _vpaes_schedule_192_smear
- b Loop_schedule_192
-
-@@
-@@ .aes_schedule_256
-@@
-@@ 256-bit specific part of key schedule.
-@@
-@@ The structure here is very similar to the 128-bit
-@@ schedule, but with an additional "low side" in
-@@ q6. The low side's rounds are the same as the
-@@ high side's, except no rcon and no rotation.
-@@
-.align 4
-Lschedule_256:
- vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
- bl _vpaes_schedule_transform @ input transform
- mov r0, #7 @ mov $7, %esi
-
-Loop_schedule_256:
- bl _vpaes_schedule_mangle @ output low result
- vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
-
- @ high round
- bl _vpaes_schedule_round
- subs r0, r0, #1 @ dec %esi
- beq Lschedule_mangle_last
- bl _vpaes_schedule_mangle
-
- @ low round. swap xmm7 and xmm6
- vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
- vmov.i8 q4, #0
- vmov q5, q7 @ vmovdqa %xmm7, %xmm5
- vmov q7, q6 @ vmovdqa %xmm6, %xmm7
- bl _vpaes_schedule_low_round
- vmov q7, q5 @ vmovdqa %xmm5, %xmm7
-
- b Loop_schedule_256
-
-@@
-@@ .aes_schedule_mangle_last
-@@
-@@ Mangler for last round of key schedule
-@@ Mangles q0
-@@ when encrypting, outputs out(q0) ^ 63
-@@ when decrypting, outputs unskew(q0)
-@@
-@@ Always called right before return... jumps to cleanup and exits
-@@
-.align 4
-Lschedule_mangle_last:
- @ schedule last round key from xmm0
- adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew
- tst r3, r3
- bne Lschedule_mangle_last_dec
-
- @ encrypting
- vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
- adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform
- add r2, r2, #32 @ add $32, %rdx
- vmov q2, q0
- vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
- vtbl.8 d1, {q2}, d3
-
-Lschedule_mangle_last_dec:
- sub r2, r2, #16 @ add $-16, %rdx
- veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0
- bl _vpaes_schedule_transform @ output transform
- vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
-
- @ cleanup
- veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
- veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
- veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
- veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
- veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
- veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
- veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
- veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
- ldmia sp!, {r3,pc} @ return
-
-
-@@
-@@ .aes_schedule_192_smear
-@@
-@@ Smear the short, low side in the 192-bit key schedule.
-@@
-@@ Inputs:
-@@ q7: high side, b a x y
-@@ q6: low side, d c 0 0
-@@
-@@ Outputs:
-@@ q6: b+c+d b+c 0 0
-@@ q0: b+c+d b+c b a
-@@
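-@@ The input/output contract above, written as a minimal Python sketch
-@@ ("+" in the diagrams is XOR; words are listed highest first):
-@@
-@@  def schedule_192_smear(q7, q6):
-@@      b, a, x, y = q7                     # high side
-@@      d, c, _, _ = q6                     # low side, low words already zero
-@@      q6_out = (b ^ c ^ d, b ^ c, 0, 0)
-@@      q0_out = (b ^ c ^ d, b ^ c, b, a)
-@@      return q6_out, q0_out
-@@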
-#ifdef __thumb2__
-.thumb_func _vpaes_schedule_192_smear
-#endif
-.align 4
-_vpaes_schedule_192_smear:
- vmov.i8 q1, #0
- vdup.32 q0, d15[1]
- vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
- vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
- veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
- veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
- veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
- vmov q0, q6 @ vmovdqa %xmm6, %xmm0
- vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
- bx lr
-
-
-@@
-@@ .aes_schedule_round
-@@
-@@ Runs one main round of the key schedule on q0, q7
-@@
-@@ Specifically, runs subbytes on the high dword of q0
-@@ then rotates it by one byte and xors into the low dword of
-@@ q7.
-@@
-@@ Adds rcon from low byte of q8, then rotates q8 for
-@@ next rcon.
-@@
-@@ Smears the dwords of q7 by xoring the low into the
-@@ second low, result into third, result into highest.
-@@
-@@ Returns results in q7 = q0.
-@@ Clobbers q1-q4, r11.
-@@
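-@@ For reference, the textbook FIPS-197 form of this step as a Python
-@@ sketch (big-endian 32-bit words; SBOX is the standard AES S-box table,
-@@ assumed to be defined; the in-register byte order used here differs):
-@@
-@@  def schedule_round(prev, rcon):
-@@      t = prev[3]
-@@      t = ((t << 8) | (t >> 24)) & 0xffffffff         # rotate by one byte
-@@      t = int.from_bytes(bytes(SBOX[b] for b in t.to_bytes(4, "big")), "big")
-@@      t ^= rcon << 24                                 # add rcon
-@@      out = [prev[0] ^ t]
-@@      for i in range(1, 4):                           # smear the xors down
-@@          out.append(prev[i] ^ out[i - 1])
-@@      return out
-@@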
-#ifdef __thumb2__
-.thumb_func _vpaes_schedule_round
-#endif
-.align 4
-_vpaes_schedule_round:
- @ extract rcon from xmm8
- vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
- vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
- vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
- veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
-
- @ rotate
- vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
- vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
-
- @ fall through...
-
- @ low round: same as high round, but no rotation and no rcon.
-_vpaes_schedule_low_round:
- @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
- @ We pin other values in _vpaes_key_preheat, so load them now.
- adr r11, Lk_sb1
- vld1.64 {q14,q15}, [r11]
-
- @ smear xmm7
- vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
- veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
- vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
-
- @ subbytes
- vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
- veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
- vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
- vtbl.8 d5, {q11}, d3
- veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
- vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
- vtbl.8 d7, {q10}, d1
- veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
- vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
- vtbl.8 d9, {q10}, d3
- veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7
- vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
- vtbl.8 d7, {q10}, d7
- veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
- vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
- vtbl.8 d5, {q10}, d9
- veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
- veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
- vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
- vtbl.8 d9, {q15}, d7
- vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
- vtbl.8 d3, {q14}, d5
- veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
-
- @ add in smeared stuff
- veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
- veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
- bx lr
-
-
-@@
-@@ .aes_schedule_transform
-@@
-@@ Linear-transform q0 according to tables at [r11]
-@@
-@@ Requires that q9 = 0x0F0F... as in preheat
-@@ Output in q0
-@@ Clobbers q1, q2, q14, q15
-@@
-#ifdef __thumb2__
-.thumb_func _vpaes_schedule_transform
-#endif
-.align 4
-_vpaes_schedule_transform:
- vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
- @ vmovdqa 16(%r11), %xmm1 # hi
- vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
- vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
- vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
- vtbl.8 d5, {q14}, d3
- vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
- vtbl.8 d1, {q15}, d1
- veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
- bx lr
-
-
-@@
-@@ .aes_schedule_mangle
-@@
-@@ Mangles q0 from (basis-transformed) standard version
-@@ to our version.
-@@
-@@ On encrypt,
-@@ xor with 0x63
-@@ multiply by circulant 0,1,1,1
-@@ apply shiftrows transform
-@@
-@@ On decrypt,
-@@ xor with 0x63
-@@ multiply by "inverse mixcolumns" circulant E,B,D,9
-@@ deskew
-@@ apply shiftrows transform
-@@
-@@
-@@ Writes out to [r2], and increments or decrements it
-@@ Keeps track of round number mod 4 in r8
-@@ Preserves q0
-@@ Clobbers q1-q5
-@@
-#ifdef __thumb2__
-.thumb_func _vpaes_schedule_mangle
-#endif
-.align 4
-_vpaes_schedule_mangle:
- tst r3, r3
- vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
- adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16.
- vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5
- bne Lschedule_mangle_dec
-
- @ encrypting
- @ Write to q2 so we do not overlap table and destination below.
- veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4
- add r2, r2, #16 @ add $16, %rdx
- vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
- vtbl.8 d9, {q2}, d11
- vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
- vtbl.8 d3, {q4}, d11
- vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
- vtbl.8 d7, {q1}, d11
- veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
- vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
- veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
-
- b Lschedule_mangle_both
-.align 4
-Lschedule_mangle_dec:
- @ inverse mix columns
- adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11
- vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
- vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
-
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
- @ vmovdqa 0x10(%r11), %xmm3
- vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
- vtbl.8 d5, {q14}, d9
- vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
- vtbl.8 d7, {q15}, d3
- @ Load .Lk_dksb ahead of time.
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
- @ vmovdqa 0x30(%r11), %xmm3
- @ Write to q13 so we do not overlap table and destination.
- veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
- vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
- vtbl.8 d7, {q13}, d11
-
- vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
- vtbl.8 d5, {q14}, d9
- veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
- vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
- vtbl.8 d7, {q15}, d3
- @ Load .Lk_dkse ahead of time.
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
- @ vmovdqa 0x50(%r11), %xmm3
- @ Write to q13 so we do not overlap table and destination.
- veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
- vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
- vtbl.8 d7, {q13}, d11
-
- vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
- vtbl.8 d5, {q14}, d9
- veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
- vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
- vtbl.8 d7, {q15}, d3
- @ Load .Lk_dks9 ahead of time.
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
- @ vmovdqa 0x70(%r11), %xmm4
- @ Write to q13 so we do not overlap table and destination.
- veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
-
- vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
- vtbl.8 d5, {q14}, d9
- vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
- vtbl.8 d7, {q13}, d11
- vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
- vtbl.8 d9, {q15}, d3
- vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
- veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
- veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
-
- sub r2, r2, #16 @ add $-16, %rdx
-
-Lschedule_mangle_both:
- @ Write to q2 so table and destination do not overlap.
- vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
- vtbl.8 d5, {q3}, d3
- add r8, r8, #64-16 @ add $-16, %r8
- and r8, r8, #~(1<<6) @ and $0x30, %r8
- vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
- bx lr
-
-
-.globl _vpaes_set_encrypt_key
-.private_extern _vpaes_set_encrypt_key
-#ifdef __thumb2__
-.thumb_func _vpaes_set_encrypt_key
-#endif
-.align 4
-_vpaes_set_encrypt_key:
- stmdb sp!, {r7,r8,r9,r10,r11, lr}
- vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
- lsr r9, r1, #5 @ shr $5,%eax
- add r9, r9, #5 @ $5,%eax
- str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
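- @ As a quick check of the formula above (Python sketch): nbits/32+5 gives
- @ 9, 11 and 13 for 128-, 192- and 256-bit keys, one less than the usual
- @ 10/12/14 round counts (see the *_key_to_bsaes converters below).
- @
- @  for nbits in (128, 192, 256):
- @      print(nbits, (nbits >> 5) + 5)   # -> 128 9, 192 11, 256 13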
-
- mov r3, #0 @ mov $0,%ecx
- mov r8, #0x30 @ mov $0x30,%r8d
- bl _vpaes_schedule_core
- eor r0, r0, r0
-
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
-
-
-.globl _vpaes_set_decrypt_key
-.private_extern _vpaes_set_decrypt_key
-#ifdef __thumb2__
-.thumb_func _vpaes_set_decrypt_key
-#endif
-.align 4
-_vpaes_set_decrypt_key:
- stmdb sp!, {r7,r8,r9,r10,r11, lr}
- vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
- lsr r9, r1, #5 @ shr $5,%eax
- add r9, r9, #5 @ $5,%eax
- str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
- lsl r9, r9, #4 @ shl $4,%eax
- add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
- add r2, r2, r9
-
- mov r3, #1 @ mov $1,%ecx
- lsr r8, r1, #1 @ shr $1,%r8d
- and r8, r8, #32 @ and $32,%r8d
- eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
- bl _vpaes_schedule_core
-
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
-
-
-@ Additional constants for converting to bsaes.
-
-.align 4
-_vpaes_convert_consts:
-@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
-@ transform in the AES S-box. 0x63 is incorporated into the low half of the
-@ table. This was computed with the following script:
-@
-@ def u64s_to_u128(x, y):
-@ return x | (y << 64)
-@ def u128_to_u64s(w):
-@ return w & ((1<<64)-1), w >> 64
-@ def get_byte(w, i):
-@ return (w >> (i*8)) & 0xff
-@ def apply_table(table, b):
-@ lo = b & 0xf
-@ hi = b >> 4
-@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
-@ def opt(b):
-@ table = [
-@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
-@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
-@ ]
-@ return apply_table(table, b)
-@ def rot_byte(b, n):
-@ return 0xff & ((b << n) | (b >> (8-n)))
-@ def skew(x):
-@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
-@ rot_byte(x, 4))
-@ table = [0, 0]
-@ for i in range(16):
-@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
-@ table[1] |= skew(opt(i<<4)) << (i*8)
-@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
-@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
-Lk_opt_then_skew:
-.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
-.quad 0x1f30062936192f00, 0xb49bad829db284ab
-
-@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
-@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
-@ becomes 0x22334411 and then 0x11443322.
-Lk_decrypt_transform:
-.quad 0x0704050603000102, 0x0f0c0d0e0b08090a
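-@ A minimal Python sketch of the example above (the table selects bytes
-@ (2, 1, 0, 3) within each 32-bit word):
-@
-@  def rotl32(x, n):
-@      return ((x << n) | (x >> (32 - n))) & 0xffffffff
-@  w = 0x11223344
-@  assert rotl32(w, 8) == 0x22334411
-@  assert int.from_bytes(rotl32(w, 8).to_bytes(4, "little"), "big") == 0x11443322
-@  # Same result as permuting the little-endian bytes of w by (2, 1, 0, 3):
-@  b = w.to_bytes(4, "little")
-@  assert int.from_bytes(bytes(b[i] for i in (2, 1, 0, 3)), "little") == 0x11443322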
-
-
-@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
-.globl _vpaes_encrypt_key_to_bsaes
-.private_extern _vpaes_encrypt_key_to_bsaes
-#ifdef __thumb2__
-.thumb_func _vpaes_encrypt_key_to_bsaes
-#endif
-.align 4
-_vpaes_encrypt_key_to_bsaes:
- stmdb sp!, {r11, lr}
-
- @ See _vpaes_schedule_core for the key schedule logic. In particular,
- @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
- @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
- @ contain the transformations not in the bsaes representation. This
- @ function inverts those transforms.
- @
- @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
- @ representation, which does not match the other aes_nohw_*
- @ implementations. The ARM aes_nohw_* stores each 32-bit word
- @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
- @ cost of extra REV and VREV32 operations in little-endian ARM.
-
- vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
- adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16.
- add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
-
- vld1.64 {q12}, [r2]
- vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64
- adr r11, Lk_opt @ Must be aligned to 8 mod 16.
- vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied
-
- @ vpaes stores one fewer round count than bsaes, but the number of keys
- @ is the same.
- ldr r2, [r1,#240]
- add r2, r2, #1
- str r2, [r0,#240]
-
- @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
- @ Invert this with .Lk_opt.
- vld1.64 {q0}, [r1]!
- bl _vpaes_schedule_transform
- vrev32.8 q0, q0
- vst1.64 {q0}, [r0]!
-
- @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
- @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
- @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
-Loop_enc_key_to_bsaes:
- vld1.64 {q0}, [r1]!
-
- @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
- @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
- @ We use r3 rather than r8 to avoid a callee-saved register.
- vld1.64 {q1}, [r3]
- vtbl.8 d4, {q0}, d2
- vtbl.8 d5, {q0}, d3
- add r3, r3, #16
- and r3, r3, #~(1<<6)
- vmov q0, q2
-
- @ Handle the last key differently.
- subs r2, r2, #1
- beq Loop_enc_key_to_bsaes_last
-
- @ Multiply by the circulant. This is its own inverse.
- vtbl.8 d2, {q0}, d24
- vtbl.8 d3, {q0}, d25
- vmov q0, q1
- vtbl.8 d4, {q1}, d24
- vtbl.8 d5, {q1}, d25
- veor q0, q0, q2
- vtbl.8 d2, {q2}, d24
- vtbl.8 d3, {q2}, d25
- veor q0, q0, q1
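- @ Why it is its own inverse: with P a one-step rotation of four elements,
- @ (P + P^2 + P^3)^2 = P^2 + P^4 + P^6 = P^2 + I + P^2 = I over GF(2).
- @ A quick Python sketch on the four bytes of one word:
- @
- @  rot = lambda v: v[1:] + v[:1]
- @  def circ(v):
- @      a, b, c = rot(v), rot(rot(v)), rot(rot(rot(v)))
- @      return [x ^ y ^ z for x, y, z in zip(a, b, c)]
- @  col = [0x11, 0x22, 0x33, 0x44]
- @  assert circ(circ(col)) == col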
-
- @ XOR and finish.
- veor q0, q0, q10
- bl _vpaes_schedule_transform
- vrev32.8 q0, q0
- vst1.64 {q0}, [r0]!
- b Loop_enc_key_to_bsaes
-
-Loop_enc_key_to_bsaes_last:
- @ The final key does not have a basis transform (note
- @ .Lschedule_mangle_last inverts the original transform). It only XORs
- @ 0x63 and applies ShiftRows. The latter was already inverted in the
- @ loop. Note that, because we act on the original representation, we use
- @ q11, not q10.
- veor q0, q0, q11
- vrev32.8 q0, q0
- vst1.64 {q0}, [r0]
-
- @ Wipe registers which contained key material.
- veor q0, q0, q0
- veor q1, q1, q1
- veor q2, q2, q2
-
- ldmia sp!, {r11, pc} @ return
-
-
-@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
-.globl _vpaes_decrypt_key_to_bsaes
-.private_extern _vpaes_decrypt_key_to_bsaes
-#ifdef __thumb2__
-.thumb_func _vpaes_decrypt_key_to_bsaes
-#endif
-.align 4
-_vpaes_decrypt_key_to_bsaes:
- stmdb sp!, {r11, lr}
-
- @ See _vpaes_schedule_core for the key schedule logic. Note vpaes
- @ computes the decryption key schedule in reverse. Additionally,
- @ aes-x86_64.pl shares some transformations, so we must only partially
- @ invert vpaes's transformations. In general, vpaes computes in a
- @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
- @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
- @ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
- @
- @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
- @ representation, which does not match the other aes_nohw_*
- @ implementations. The ARM aes_nohw_* stores each 32-bit word
- @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
- @ cost of extra REV and VREV32 operations in little-endian ARM.
-
- adr r2, Lk_decrypt_transform
- adr r3, Lk_sr+0x30
- adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
- vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
- vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
-
- @ vpaes stores one fewer round count than bsaes, but the number of keys
- @ is the same.
- ldr r2, [r1,#240]
- add r2, r2, #1
- str r2, [r0,#240]
-
- @ Undo the basis change and reapply the S-box affine transform. See
- @ .Lschedule_mangle_last.
- vld1.64 {q0}, [r1]!
- bl _vpaes_schedule_transform
- vrev32.8 q0, q0
- vst1.64 {q0}, [r0]!
-
- @ See _vpaes_schedule_mangle for the transform on the middle keys. Note
- @ it simultaneously inverts MixColumns and the S-box affine transform.
- @ See .Lk_dksd through .Lk_dks9.
-Loop_dec_key_to_bsaes:
- vld1.64 {q0}, [r1]!
-
- @ Invert the ShiftRows step (see .Lschedule_mangle_both). Reading the
- @ schedule forwards, while vpaes generated it in reverse, cancels the
- @ inversion, which determines the direction we cycle r3. We use r3
- @ rather than r8 to avoid a callee-saved register.
- vld1.64 {q1}, [r3]
- vtbl.8 d4, {q0}, d2
- vtbl.8 d5, {q0}, d3
- add r3, r3, #64-16
- and r3, r3, #~(1<<6)
- vmov q0, q2
-
- @ Handle the last key differently.
- subs r2, r2, #1
- beq Loop_dec_key_to_bsaes_last
-
- @ Undo the basis change and reapply the S-box affine transform.
- bl _vpaes_schedule_transform
-
- @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
- @ combine the two operations in .Lk_decrypt_transform.
- @
- @ TODO(davidben): Where does the rotation come from?
- vtbl.8 d2, {q0}, d24
- vtbl.8 d3, {q0}, d25
-
- vst1.64 {q1}, [r0]!
- b Loop_dec_key_to_bsaes
-
-Loop_dec_key_to_bsaes_last:
- @ The final key only inverts ShiftRows (already done in the loop). See
- @ .Lschedule_am_decrypting. Its basis is not transformed.
- vrev32.8 q0, q0
- vst1.64 {q0}, [r0]!
-
- @ Wipe registers which contained key material.
- veor q0, q0, q0
- veor q1, q1, q1
- veor q2, q2, q2
-
- ldmia sp!, {r11, pc} @ return
-
-.globl _vpaes_ctr32_encrypt_blocks
-.private_extern _vpaes_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func _vpaes_ctr32_encrypt_blocks
-#endif
-.align 4
-_vpaes_ctr32_encrypt_blocks:
- mov ip, sp
- stmdb sp!, {r7,r8,r9,r10,r11, lr}
- @ This function uses q4-q7 (d8-d15), which are callee-saved.
- vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
- cmp r2, #0
- @ r8 is passed on the stack.
- ldr r8, [ip]
- beq Lctr32_done
-
- @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
- mov r9, r3
- mov r3, r2
- mov r2, r9
-
- @ Load the IV and counter portion.
- ldr r7, [r8, #12]
- vld1.8 {q7}, [r8]
-
- bl _vpaes_preheat
- rev r7, r7 @ The counter is big-endian.
-
-Lctr32_loop:
- vmov q0, q7
- vld1.8 {q6}, [r0]! @ Load input ahead of time
- bl _vpaes_encrypt_core
- veor q0, q0, q6 @ XOR input and result
- vst1.8 {q0}, [r1]!
- subs r3, r3, #1
- @ Update the counter.
- add r7, r7, #1
- rev r9, r7
- vmov.32 d15[1], r9
- bne Lctr32_loop
-
-Lctr32_done:
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
- ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
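-@ The loop above is plain CTR mode with a 32-bit big-endian counter in the
-@ last word of the IV. A minimal Python sketch of the same structure, with
-@ the block cipher abstracted as an assumed encrypt_block callable:
-@
-@  def ctr32_encrypt_blocks(encrypt_block, inp, ivec):
-@      out = b""
-@      block = bytearray(ivec)                    # 16-byte IV || counter
-@      ctr = int.from_bytes(block[12:16], "big")
-@      for off in range(0, len(inp), 16):
-@          keystream = encrypt_block(bytes(block))
-@          out += bytes(a ^ b for a, b in zip(inp[off:off + 16], keystream))
-@          ctr = (ctr + 1) & 0xffffffff           # wrap at 32 bits
-@          block[12:16] = ctr.to_bytes(4, "big")  # only the counter word
-@      return out
-@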
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/test/trampoline-armv4-apple.S b/apple-arm/crypto/test/trampoline-armv4-apple.S
deleted file mode 100644
index 425a085..0000000
--- a/apple-arm/crypto/test/trampoline-armv4-apple.S
+++ /dev/null
@@ -1,368 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-.syntax unified
-
-
-
-
-.text
-
-@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
-@ with |argv|, then saves the callee-saved registers into |state|. It returns
-@ the result of |func|. The |unwind| argument is unused.
-@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
-@ const uint32_t *argv, size_t argc,
-@ int unwind);
-
-.globl _abi_test_trampoline
-.private_extern _abi_test_trampoline
-.align 4
-_abi_test_trampoline:
- @ Save parameters and all callee-saved registers. For convenience, we
- @ save r9 on iOS even though it's volatile.
- vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
- stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
-
- @ Reserve stack space for six (10-4) stack parameters, plus an extra 4
- @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
- sub sp, sp, #28
-
- @ Every register in AAPCS is either non-volatile or a parameter (except
- @ r9 on iOS), so by the time of the actual call this code has no scratch
- @ registers left. First fill in stack parameters while there are
- @ registers to spare.
- cmp r3, #4
- bls Lstack_args_done
- mov r4, sp @ r4 is the output pointer.
- add r5, r2, r3, lsl #2 @ Set r5 to the end of argv.
- add r2, r2, #16 @ Skip four arguments.
-Lstack_args_loop:
- ldr r6, [r2], #4
- cmp r2, r5
- str r6, [r4], #4
- bne Lstack_args_loop
-
-Lstack_args_done:
- @ Load registers from |r1|.
- vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
-#if defined(__APPLE__)
- @ r9 is not volatile on iOS.
- ldmia r1!, {r4,r5,r6,r7,r8,r10-r11}
-#else
- ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
-#endif
-
- @ Load register parameters. This uses up our remaining registers, so we
- @ repurpose lr as scratch space.
- ldr r3, [sp, #40] @ Reload argc.
- ldr lr, [sp, #36] @ Load argv into lr.
- cmp r3, #3
- bhi Larg_r3
- beq Larg_r2
- cmp r3, #1
- bhi Larg_r1
- beq Larg_r0
- b Largs_done
-
-Larg_r3:
- ldr r3, [lr, #12] @ argv[3]
-Larg_r2:
- ldr r2, [lr, #8] @ argv[2]
-Larg_r1:
- ldr r1, [lr, #4] @ argv[1]
-Larg_r0:
- ldr r0, [lr] @ argv[0]
-Largs_done:
-
- @ With every other register in use, load the function pointer into lr
- @ and call the function.
- ldr lr, [sp, #28]
- blx lr
-
- @ r1-r3 are free for use again. The trampoline only supports
- @ single-return functions. Pass r4-r11 to the caller.
- ldr r1, [sp, #32]
- vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
-#if defined(__APPLE__)
- @ r9 is not volatile on iOS.
- stmia r1!, {r4,r5,r6,r7,r8,r10-r11}
-#else
- stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
-#endif
-
- @ Unwind the stack and restore registers.
- add sp, sp, #44 @ 44 = 28+16
- ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above).
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
- bx lr
-
-
-.globl _abi_test_clobber_r0
-.private_extern _abi_test_clobber_r0
-.align 4
-_abi_test_clobber_r0:
- mov r0, #0
- bx lr
-
-
-.globl _abi_test_clobber_r1
-.private_extern _abi_test_clobber_r1
-.align 4
-_abi_test_clobber_r1:
- mov r1, #0
- bx lr
-
-
-.globl _abi_test_clobber_r2
-.private_extern _abi_test_clobber_r2
-.align 4
-_abi_test_clobber_r2:
- mov r2, #0
- bx lr
-
-
-.globl _abi_test_clobber_r3
-.private_extern _abi_test_clobber_r3
-.align 4
-_abi_test_clobber_r3:
- mov r3, #0
- bx lr
-
-
-.globl _abi_test_clobber_r4
-.private_extern _abi_test_clobber_r4
-.align 4
-_abi_test_clobber_r4:
- mov r4, #0
- bx lr
-
-
-.globl _abi_test_clobber_r5
-.private_extern _abi_test_clobber_r5
-.align 4
-_abi_test_clobber_r5:
- mov r5, #0
- bx lr
-
-
-.globl _abi_test_clobber_r6
-.private_extern _abi_test_clobber_r6
-.align 4
-_abi_test_clobber_r6:
- mov r6, #0
- bx lr
-
-
-.globl _abi_test_clobber_r7
-.private_extern _abi_test_clobber_r7
-.align 4
-_abi_test_clobber_r7:
- mov r7, #0
- bx lr
-
-
-.globl _abi_test_clobber_r8
-.private_extern _abi_test_clobber_r8
-.align 4
-_abi_test_clobber_r8:
- mov r8, #0
- bx lr
-
-
-.globl _abi_test_clobber_r9
-.private_extern _abi_test_clobber_r9
-.align 4
-_abi_test_clobber_r9:
- mov r9, #0
- bx lr
-
-
-.globl _abi_test_clobber_r10
-.private_extern _abi_test_clobber_r10
-.align 4
-_abi_test_clobber_r10:
- mov r10, #0
- bx lr
-
-
-.globl _abi_test_clobber_r11
-.private_extern _abi_test_clobber_r11
-.align 4
-_abi_test_clobber_r11:
- mov r11, #0
- bx lr
-
-
-.globl _abi_test_clobber_r12
-.private_extern _abi_test_clobber_r12
-.align 4
-_abi_test_clobber_r12:
- mov r12, #0
- bx lr
-
-
-.globl _abi_test_clobber_d0
-.private_extern _abi_test_clobber_d0
-.align 4
-_abi_test_clobber_d0:
- mov r0, #0
- vmov s0, r0
- vmov s1, r0
- bx lr
-
-
-.globl _abi_test_clobber_d1
-.private_extern _abi_test_clobber_d1
-.align 4
-_abi_test_clobber_d1:
- mov r0, #0
- vmov s2, r0
- vmov s3, r0
- bx lr
-
-
-.globl _abi_test_clobber_d2
-.private_extern _abi_test_clobber_d2
-.align 4
-_abi_test_clobber_d2:
- mov r0, #0
- vmov s4, r0
- vmov s5, r0
- bx lr
-
-
-.globl _abi_test_clobber_d3
-.private_extern _abi_test_clobber_d3
-.align 4
-_abi_test_clobber_d3:
- mov r0, #0
- vmov s6, r0
- vmov s7, r0
- bx lr
-
-
-.globl _abi_test_clobber_d4
-.private_extern _abi_test_clobber_d4
-.align 4
-_abi_test_clobber_d4:
- mov r0, #0
- vmov s8, r0
- vmov s9, r0
- bx lr
-
-
-.globl _abi_test_clobber_d5
-.private_extern _abi_test_clobber_d5
-.align 4
-_abi_test_clobber_d5:
- mov r0, #0
- vmov s10, r0
- vmov s11, r0
- bx lr
-
-
-.globl _abi_test_clobber_d6
-.private_extern _abi_test_clobber_d6
-.align 4
-_abi_test_clobber_d6:
- mov r0, #0
- vmov s12, r0
- vmov s13, r0
- bx lr
-
-
-.globl _abi_test_clobber_d7
-.private_extern _abi_test_clobber_d7
-.align 4
-_abi_test_clobber_d7:
- mov r0, #0
- vmov s14, r0
- vmov s15, r0
- bx lr
-
-
-.globl _abi_test_clobber_d8
-.private_extern _abi_test_clobber_d8
-.align 4
-_abi_test_clobber_d8:
- mov r0, #0
- vmov s16, r0
- vmov s17, r0
- bx lr
-
-
-.globl _abi_test_clobber_d9
-.private_extern _abi_test_clobber_d9
-.align 4
-_abi_test_clobber_d9:
- mov r0, #0
- vmov s18, r0
- vmov s19, r0
- bx lr
-
-
-.globl _abi_test_clobber_d10
-.private_extern _abi_test_clobber_d10
-.align 4
-_abi_test_clobber_d10:
- mov r0, #0
- vmov s20, r0
- vmov s21, r0
- bx lr
-
-
-.globl _abi_test_clobber_d11
-.private_extern _abi_test_clobber_d11
-.align 4
-_abi_test_clobber_d11:
- mov r0, #0
- vmov s22, r0
- vmov s23, r0
- bx lr
-
-
-.globl _abi_test_clobber_d12
-.private_extern _abi_test_clobber_d12
-.align 4
-_abi_test_clobber_d12:
- mov r0, #0
- vmov s24, r0
- vmov s25, r0
- bx lr
-
-
-.globl _abi_test_clobber_d13
-.private_extern _abi_test_clobber_d13
-.align 4
-_abi_test_clobber_d13:
- mov r0, #0
- vmov s26, r0
- vmov s27, r0
- bx lr
-
-
-.globl _abi_test_clobber_d14
-.private_extern _abi_test_clobber_d14
-.align 4
-_abi_test_clobber_d14:
- mov r0, #0
- vmov s28, r0
- vmov s29, r0
- bx lr
-
-
-.globl _abi_test_clobber_d15
-.private_extern _abi_test_clobber_d15
-.align 4
-_abi_test_clobber_d15:
- mov r0, #0
- vmov s30, r0
- vmov s31, r0
- bx lr
-
-#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-x86/crypto/chacha/chacha-x86-apple.S b/apple-x86/crypto/chacha/chacha-x86-apple.S
deleted file mode 100644
index baa06ac..0000000
--- a/apple-x86/crypto/chacha/chacha-x86-apple.S
+++ /dev/null
@@ -1,973 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _ChaCha20_ctr32
-.private_extern _ChaCha20_ctr32
-.align 4
-_ChaCha20_ctr32:
-L_ChaCha20_ctr32_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- xorl %eax,%eax
- cmpl 28(%esp),%eax
- je L000no_data
- call Lpic_point
-Lpic_point:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp
- testl $16777216,(%ebp)
- jz L001x86
- testl $512,4(%ebp)
- jz L001x86
- jmp Lssse3_shortcut
-L001x86:
- movl 32(%esp),%esi
- movl 36(%esp),%edi
- subl $132,%esp
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edx
- movl %eax,80(%esp)
- movl %ebx,84(%esp)
- movl %ecx,88(%esp)
- movl %edx,92(%esp)
- movl 16(%esi),%eax
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edx
- movl %eax,96(%esp)
- movl %ebx,100(%esp)
- movl %ecx,104(%esp)
- movl %edx,108(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- subl $1,%eax
- movl %eax,112(%esp)
- movl %ebx,116(%esp)
- movl %ecx,120(%esp)
- movl %edx,124(%esp)
- jmp L002entry
-.align 4,0x90
-L003outer_loop:
- movl %ebx,156(%esp)
- movl %eax,152(%esp)
- movl %ecx,160(%esp)
-L002entry:
- movl $1634760805,%eax
- movl $857760878,4(%esp)
- movl $2036477234,8(%esp)
- movl $1797285236,12(%esp)
- movl 84(%esp),%ebx
- movl 88(%esp),%ebp
- movl 104(%esp),%ecx
- movl 108(%esp),%esi
- movl 116(%esp),%edx
- movl 120(%esp),%edi
- movl %ebx,20(%esp)
- movl %ebp,24(%esp)
- movl %ecx,40(%esp)
- movl %esi,44(%esp)
- movl %edx,52(%esp)
- movl %edi,56(%esp)
- movl 92(%esp),%ebx
- movl 124(%esp),%edi
- movl 112(%esp),%edx
- movl 80(%esp),%ebp
- movl 96(%esp),%ecx
- movl 100(%esp),%esi
- addl $1,%edx
- movl %ebx,28(%esp)
- movl %edi,60(%esp)
- movl %edx,112(%esp)
- movl $10,%ebx
- jmp L004loop
-.align 4,0x90
-L004loop:
- addl %ebp,%eax
- movl %ebx,128(%esp)
- movl %ebp,%ebx
- xorl %eax,%edx
- roll $16,%edx
- addl %edx,%ecx
- xorl %ecx,%ebx
- movl 52(%esp),%edi
- roll $12,%ebx
- movl 20(%esp),%ebp
- addl %ebx,%eax
- xorl %eax,%edx
- movl %eax,(%esp)
- roll $8,%edx
- movl 4(%esp),%eax
- addl %edx,%ecx
- movl %edx,48(%esp)
- xorl %ecx,%ebx
- addl %ebp,%eax
- roll $7,%ebx
- xorl %eax,%edi
- movl %ecx,32(%esp)
- roll $16,%edi
- movl %ebx,16(%esp)
- addl %edi,%esi
- movl 40(%esp),%ecx
- xorl %esi,%ebp
- movl 56(%esp),%edx
- roll $12,%ebp
- movl 24(%esp),%ebx
- addl %ebp,%eax
- xorl %eax,%edi
- movl %eax,4(%esp)
- roll $8,%edi
- movl 8(%esp),%eax
- addl %edi,%esi
- movl %edi,52(%esp)
- xorl %esi,%ebp
- addl %ebx,%eax
- roll $7,%ebp
- xorl %eax,%edx
- movl %esi,36(%esp)
- roll $16,%edx
- movl %ebp,20(%esp)
- addl %edx,%ecx
- movl 44(%esp),%esi
- xorl %ecx,%ebx
- movl 60(%esp),%edi
- roll $12,%ebx
- movl 28(%esp),%ebp
- addl %ebx,%eax
- xorl %eax,%edx
- movl %eax,8(%esp)
- roll $8,%edx
- movl 12(%esp),%eax
- addl %edx,%ecx
- movl %edx,56(%esp)
- xorl %ecx,%ebx
- addl %ebp,%eax
- roll $7,%ebx
- xorl %eax,%edi
- roll $16,%edi
- movl %ebx,24(%esp)
- addl %edi,%esi
- xorl %esi,%ebp
- roll $12,%ebp
- movl 20(%esp),%ebx
- addl %ebp,%eax
- xorl %eax,%edi
- movl %eax,12(%esp)
- roll $8,%edi
- movl (%esp),%eax
- addl %edi,%esi
- movl %edi,%edx
- xorl %esi,%ebp
- addl %ebx,%eax
- roll $7,%ebp
- xorl %eax,%edx
- roll $16,%edx
- movl %ebp,28(%esp)
- addl %edx,%ecx
- xorl %ecx,%ebx
- movl 48(%esp),%edi
- roll $12,%ebx
- movl 24(%esp),%ebp
- addl %ebx,%eax
- xorl %eax,%edx
- movl %eax,(%esp)
- roll $8,%edx
- movl 4(%esp),%eax
- addl %edx,%ecx
- movl %edx,60(%esp)
- xorl %ecx,%ebx
- addl %ebp,%eax
- roll $7,%ebx
- xorl %eax,%edi
- movl %ecx,40(%esp)
- roll $16,%edi
- movl %ebx,20(%esp)
- addl %edi,%esi
- movl 32(%esp),%ecx
- xorl %esi,%ebp
- movl 52(%esp),%edx
- roll $12,%ebp
- movl 28(%esp),%ebx
- addl %ebp,%eax
- xorl %eax,%edi
- movl %eax,4(%esp)
- roll $8,%edi
- movl 8(%esp),%eax
- addl %edi,%esi
- movl %edi,48(%esp)
- xorl %esi,%ebp
- addl %ebx,%eax
- roll $7,%ebp
- xorl %eax,%edx
- movl %esi,44(%esp)
- roll $16,%edx
- movl %ebp,24(%esp)
- addl %edx,%ecx
- movl 36(%esp),%esi
- xorl %ecx,%ebx
- movl 56(%esp),%edi
- roll $12,%ebx
- movl 16(%esp),%ebp
- addl %ebx,%eax
- xorl %eax,%edx
- movl %eax,8(%esp)
- roll $8,%edx
- movl 12(%esp),%eax
- addl %edx,%ecx
- movl %edx,52(%esp)
- xorl %ecx,%ebx
- addl %ebp,%eax
- roll $7,%ebx
- xorl %eax,%edi
- roll $16,%edi
- movl %ebx,28(%esp)
- addl %edi,%esi
- xorl %esi,%ebp
- movl 48(%esp),%edx
- roll $12,%ebp
- movl 128(%esp),%ebx
- addl %ebp,%eax
- xorl %eax,%edi
- movl %eax,12(%esp)
- roll $8,%edi
- movl (%esp),%eax
- addl %edi,%esi
- movl %edi,56(%esp)
- xorl %esi,%ebp
- roll $7,%ebp
- decl %ebx
- jnz L004loop
- movl 160(%esp),%ebx
- addl $1634760805,%eax
- addl 80(%esp),%ebp
- addl 96(%esp),%ecx
- addl 100(%esp),%esi
- cmpl $64,%ebx
- jb L005tail
- movl 156(%esp),%ebx
- addl 112(%esp),%edx
- addl 120(%esp),%edi
- xorl (%ebx),%eax
- xorl 16(%ebx),%ebp
- movl %eax,(%esp)
- movl 152(%esp),%eax
- xorl 32(%ebx),%ecx
- xorl 36(%ebx),%esi
- xorl 48(%ebx),%edx
- xorl 56(%ebx),%edi
- movl %ebp,16(%eax)
- movl %ecx,32(%eax)
- movl %esi,36(%eax)
- movl %edx,48(%eax)
- movl %edi,56(%eax)
- movl 4(%esp),%ebp
- movl 8(%esp),%ecx
- movl 12(%esp),%esi
- movl 20(%esp),%edx
- movl 24(%esp),%edi
- addl $857760878,%ebp
- addl $2036477234,%ecx
- addl $1797285236,%esi
- addl 84(%esp),%edx
- addl 88(%esp),%edi
- xorl 4(%ebx),%ebp
- xorl 8(%ebx),%ecx
- xorl 12(%ebx),%esi
- xorl 20(%ebx),%edx
- xorl 24(%ebx),%edi
- movl %ebp,4(%eax)
- movl %ecx,8(%eax)
- movl %esi,12(%eax)
- movl %edx,20(%eax)
- movl %edi,24(%eax)
- movl 28(%esp),%ebp
- movl 40(%esp),%ecx
- movl 44(%esp),%esi
- movl 52(%esp),%edx
- movl 60(%esp),%edi
- addl 92(%esp),%ebp
- addl 104(%esp),%ecx
- addl 108(%esp),%esi
- addl 116(%esp),%edx
- addl 124(%esp),%edi
- xorl 28(%ebx),%ebp
- xorl 40(%ebx),%ecx
- xorl 44(%ebx),%esi
- xorl 52(%ebx),%edx
- xorl 60(%ebx),%edi
- leal 64(%ebx),%ebx
- movl %ebp,28(%eax)
- movl (%esp),%ebp
- movl %ecx,40(%eax)
- movl 160(%esp),%ecx
- movl %esi,44(%eax)
- movl %edx,52(%eax)
- movl %edi,60(%eax)
- movl %ebp,(%eax)
- leal 64(%eax),%eax
- subl $64,%ecx
- jnz L003outer_loop
- jmp L006done
-L005tail:
- addl 112(%esp),%edx
- addl 120(%esp),%edi
- movl %eax,(%esp)
- movl %ebp,16(%esp)
- movl %ecx,32(%esp)
- movl %esi,36(%esp)
- movl %edx,48(%esp)
- movl %edi,56(%esp)
- movl 4(%esp),%ebp
- movl 8(%esp),%ecx
- movl 12(%esp),%esi
- movl 20(%esp),%edx
- movl 24(%esp),%edi
- addl $857760878,%ebp
- addl $2036477234,%ecx
- addl $1797285236,%esi
- addl 84(%esp),%edx
- addl 88(%esp),%edi
- movl %ebp,4(%esp)
- movl %ecx,8(%esp)
- movl %esi,12(%esp)
- movl %edx,20(%esp)
- movl %edi,24(%esp)
- movl 28(%esp),%ebp
- movl 40(%esp),%ecx
- movl 44(%esp),%esi
- movl 52(%esp),%edx
- movl 60(%esp),%edi
- addl 92(%esp),%ebp
- addl 104(%esp),%ecx
- addl 108(%esp),%esi
- addl 116(%esp),%edx
- addl 124(%esp),%edi
- movl %ebp,28(%esp)
- movl 156(%esp),%ebp
- movl %ecx,40(%esp)
- movl 152(%esp),%ecx
- movl %esi,44(%esp)
- xorl %esi,%esi
- movl %edx,52(%esp)
- movl %edi,60(%esp)
- xorl %eax,%eax
- xorl %edx,%edx
-L007tail_loop:
- movb (%esi,%ebp,1),%al
- movb (%esp,%esi,1),%dl
- leal 1(%esi),%esi
- xorb %dl,%al
- movb %al,-1(%ecx,%esi,1)
- decl %ebx
- jnz L007tail_loop
-L006done:
- addl $132,%esp
-L000no_data:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _ChaCha20_ssse3
-.private_extern _ChaCha20_ssse3
-.align 4
-_ChaCha20_ssse3:
-L_ChaCha20_ssse3_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-Lssse3_shortcut:
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- movl 28(%esp),%ecx
- movl 32(%esp),%edx
- movl 36(%esp),%ebx
- movl %esp,%ebp
- subl $524,%esp
- andl $-64,%esp
- movl %ebp,512(%esp)
- leal Lssse3_data-Lpic_point(%eax),%eax
- movdqu (%ebx),%xmm3
- cmpl $256,%ecx
- jb L0081x
- movl %edx,516(%esp)
- movl %ebx,520(%esp)
- subl $256,%ecx
- leal 384(%esp),%ebp
- movdqu (%edx),%xmm7
- pshufd $0,%xmm3,%xmm0
- pshufd $85,%xmm3,%xmm1
- pshufd $170,%xmm3,%xmm2
- pshufd $255,%xmm3,%xmm3
- paddd 48(%eax),%xmm0
- pshufd $0,%xmm7,%xmm4
- pshufd $85,%xmm7,%xmm5
- psubd 64(%eax),%xmm0
- pshufd $170,%xmm7,%xmm6
- pshufd $255,%xmm7,%xmm7
- movdqa %xmm0,64(%ebp)
- movdqa %xmm1,80(%ebp)
- movdqa %xmm2,96(%ebp)
- movdqa %xmm3,112(%ebp)
- movdqu 16(%edx),%xmm3
- movdqa %xmm4,-64(%ebp)
- movdqa %xmm5,-48(%ebp)
- movdqa %xmm6,-32(%ebp)
- movdqa %xmm7,-16(%ebp)
- movdqa 32(%eax),%xmm7
- leal 128(%esp),%ebx
- pshufd $0,%xmm3,%xmm0
- pshufd $85,%xmm3,%xmm1
- pshufd $170,%xmm3,%xmm2
- pshufd $255,%xmm3,%xmm3
- pshufd $0,%xmm7,%xmm4
- pshufd $85,%xmm7,%xmm5
- pshufd $170,%xmm7,%xmm6
- pshufd $255,%xmm7,%xmm7
- movdqa %xmm0,(%ebp)
- movdqa %xmm1,16(%ebp)
- movdqa %xmm2,32(%ebp)
- movdqa %xmm3,48(%ebp)
- movdqa %xmm4,-128(%ebp)
- movdqa %xmm5,-112(%ebp)
- movdqa %xmm6,-96(%ebp)
- movdqa %xmm7,-80(%ebp)
- leal 128(%esi),%esi
- leal 128(%edi),%edi
- jmp L009outer_loop
-.align 4,0x90
-L009outer_loop:
- movdqa -112(%ebp),%xmm1
- movdqa -96(%ebp),%xmm2
- movdqa -80(%ebp),%xmm3
- movdqa -48(%ebp),%xmm5
- movdqa -32(%ebp),%xmm6
- movdqa -16(%ebp),%xmm7
- movdqa %xmm1,-112(%ebx)
- movdqa %xmm2,-96(%ebx)
- movdqa %xmm3,-80(%ebx)
- movdqa %xmm5,-48(%ebx)
- movdqa %xmm6,-32(%ebx)
- movdqa %xmm7,-16(%ebx)
- movdqa 32(%ebp),%xmm2
- movdqa 48(%ebp),%xmm3
- movdqa 64(%ebp),%xmm4
- movdqa 80(%ebp),%xmm5
- movdqa 96(%ebp),%xmm6
- movdqa 112(%ebp),%xmm7
- paddd 64(%eax),%xmm4
- movdqa %xmm2,32(%ebx)
- movdqa %xmm3,48(%ebx)
- movdqa %xmm4,64(%ebx)
- movdqa %xmm5,80(%ebx)
- movdqa %xmm6,96(%ebx)
- movdqa %xmm7,112(%ebx)
- movdqa %xmm4,64(%ebp)
- movdqa -128(%ebp),%xmm0
- movdqa %xmm4,%xmm6
- movdqa -64(%ebp),%xmm3
- movdqa (%ebp),%xmm4
- movdqa 16(%ebp),%xmm5
- movl $10,%edx
- nop
-.align 4,0x90
-L010loop:
- paddd %xmm3,%xmm0
- movdqa %xmm3,%xmm2
- pxor %xmm0,%xmm6
- pshufb (%eax),%xmm6
- paddd %xmm6,%xmm4
- pxor %xmm4,%xmm2
- movdqa -48(%ebx),%xmm3
- movdqa %xmm2,%xmm1
- pslld $12,%xmm2
- psrld $20,%xmm1
- por %xmm1,%xmm2
- movdqa -112(%ebx),%xmm1
- paddd %xmm2,%xmm0
- movdqa 80(%ebx),%xmm7
- pxor %xmm0,%xmm6
- movdqa %xmm0,-128(%ebx)
- pshufb 16(%eax),%xmm6
- paddd %xmm6,%xmm4
- movdqa %xmm6,64(%ebx)
- pxor %xmm4,%xmm2
- paddd %xmm3,%xmm1
- movdqa %xmm2,%xmm0
- pslld $7,%xmm2
- psrld $25,%xmm0
- pxor %xmm1,%xmm7
- por %xmm0,%xmm2
- movdqa %xmm4,(%ebx)
- pshufb (%eax),%xmm7
- movdqa %xmm2,-64(%ebx)
- paddd %xmm7,%xmm5
- movdqa 32(%ebx),%xmm4
- pxor %xmm5,%xmm3
- movdqa -32(%ebx),%xmm2
- movdqa %xmm3,%xmm0
- pslld $12,%xmm3
- psrld $20,%xmm0
- por %xmm0,%xmm3
- movdqa -96(%ebx),%xmm0
- paddd %xmm3,%xmm1
- movdqa 96(%ebx),%xmm6
- pxor %xmm1,%xmm7
- movdqa %xmm1,-112(%ebx)
- pshufb 16(%eax),%xmm7
- paddd %xmm7,%xmm5
- movdqa %xmm7,80(%ebx)
- pxor %xmm5,%xmm3
- paddd %xmm2,%xmm0
- movdqa %xmm3,%xmm1
- pslld $7,%xmm3
- psrld $25,%xmm1
- pxor %xmm0,%xmm6
- por %xmm1,%xmm3
- movdqa %xmm5,16(%ebx)
- pshufb (%eax),%xmm6
- movdqa %xmm3,-48(%ebx)
- paddd %xmm6,%xmm4
- movdqa 48(%ebx),%xmm5
- pxor %xmm4,%xmm2
- movdqa -16(%ebx),%xmm3
- movdqa %xmm2,%xmm1
- pslld $12,%xmm2
- psrld $20,%xmm1
- por %xmm1,%xmm2
- movdqa -80(%ebx),%xmm1
- paddd %xmm2,%xmm0
- movdqa 112(%ebx),%xmm7
- pxor %xmm0,%xmm6
- movdqa %xmm0,-96(%ebx)
- pshufb 16(%eax),%xmm6
- paddd %xmm6,%xmm4
- movdqa %xmm6,96(%ebx)
- pxor %xmm4,%xmm2
- paddd %xmm3,%xmm1
- movdqa %xmm2,%xmm0
- pslld $7,%xmm2
- psrld $25,%xmm0
- pxor %xmm1,%xmm7
- por %xmm0,%xmm2
- pshufb (%eax),%xmm7
- movdqa %xmm2,-32(%ebx)
- paddd %xmm7,%xmm5
- pxor %xmm5,%xmm3
- movdqa -48(%ebx),%xmm2
- movdqa %xmm3,%xmm0
- pslld $12,%xmm3
- psrld $20,%xmm0
- por %xmm0,%xmm3
- movdqa -128(%ebx),%xmm0
- paddd %xmm3,%xmm1
- pxor %xmm1,%xmm7
- movdqa %xmm1,-80(%ebx)
- pshufb 16(%eax),%xmm7
- paddd %xmm7,%xmm5
- movdqa %xmm7,%xmm6
- pxor %xmm5,%xmm3
- paddd %xmm2,%xmm0
- movdqa %xmm3,%xmm1
- pslld $7,%xmm3
- psrld $25,%xmm1
- pxor %xmm0,%xmm6
- por %xmm1,%xmm3
- pshufb (%eax),%xmm6
- movdqa %xmm3,-16(%ebx)
- paddd %xmm6,%xmm4
- pxor %xmm4,%xmm2
- movdqa -32(%ebx),%xmm3
- movdqa %xmm2,%xmm1
- pslld $12,%xmm2
- psrld $20,%xmm1
- por %xmm1,%xmm2
- movdqa -112(%ebx),%xmm1
- paddd %xmm2,%xmm0
- movdqa 64(%ebx),%xmm7
- pxor %xmm0,%xmm6
- movdqa %xmm0,-128(%ebx)
- pshufb 16(%eax),%xmm6
- paddd %xmm6,%xmm4
- movdqa %xmm6,112(%ebx)
- pxor %xmm4,%xmm2
- paddd %xmm3,%xmm1
- movdqa %xmm2,%xmm0
- pslld $7,%xmm2
- psrld $25,%xmm0
- pxor %xmm1,%xmm7
- por %xmm0,%xmm2
- movdqa %xmm4,32(%ebx)
- pshufb (%eax),%xmm7
- movdqa %xmm2,-48(%ebx)
- paddd %xmm7,%xmm5
- movdqa (%ebx),%xmm4
- pxor %xmm5,%xmm3
- movdqa -16(%ebx),%xmm2
- movdqa %xmm3,%xmm0
- pslld $12,%xmm3
- psrld $20,%xmm0
- por %xmm0,%xmm3
- movdqa -96(%ebx),%xmm0
- paddd %xmm3,%xmm1
- movdqa 80(%ebx),%xmm6
- pxor %xmm1,%xmm7
- movdqa %xmm1,-112(%ebx)
- pshufb 16(%eax),%xmm7
- paddd %xmm7,%xmm5
- movdqa %xmm7,64(%ebx)
- pxor %xmm5,%xmm3
- paddd %xmm2,%xmm0
- movdqa %xmm3,%xmm1
- pslld $7,%xmm3
- psrld $25,%xmm1
- pxor %xmm0,%xmm6
- por %xmm1,%xmm3
- movdqa %xmm5,48(%ebx)
- pshufb (%eax),%xmm6
- movdqa %xmm3,-32(%ebx)
- paddd %xmm6,%xmm4
- movdqa 16(%ebx),%xmm5
- pxor %xmm4,%xmm2
- movdqa -64(%ebx),%xmm3
- movdqa %xmm2,%xmm1
- pslld $12,%xmm2
- psrld $20,%xmm1
- por %xmm1,%xmm2
- movdqa -80(%ebx),%xmm1
- paddd %xmm2,%xmm0
- movdqa 96(%ebx),%xmm7
- pxor %xmm0,%xmm6
- movdqa %xmm0,-96(%ebx)
- pshufb 16(%eax),%xmm6
- paddd %xmm6,%xmm4
- movdqa %xmm6,80(%ebx)
- pxor %xmm4,%xmm2
- paddd %xmm3,%xmm1
- movdqa %xmm2,%xmm0
- pslld $7,%xmm2
- psrld $25,%xmm0
- pxor %xmm1,%xmm7
- por %xmm0,%xmm2
- pshufb (%eax),%xmm7
- movdqa %xmm2,-16(%ebx)
- paddd %xmm7,%xmm5
- pxor %xmm5,%xmm3
- movdqa %xmm3,%xmm0
- pslld $12,%xmm3
- psrld $20,%xmm0
- por %xmm0,%xmm3
- movdqa -128(%ebx),%xmm0
- paddd %xmm3,%xmm1
- movdqa 64(%ebx),%xmm6
- pxor %xmm1,%xmm7
- movdqa %xmm1,-80(%ebx)
- pshufb 16(%eax),%xmm7
- paddd %xmm7,%xmm5
- movdqa %xmm7,96(%ebx)
- pxor %xmm5,%xmm3
- movdqa %xmm3,%xmm1
- pslld $7,%xmm3
- psrld $25,%xmm1
- por %xmm1,%xmm3
- decl %edx
- jnz L010loop
- movdqa %xmm3,-64(%ebx)
- movdqa %xmm4,(%ebx)
- movdqa %xmm5,16(%ebx)
- movdqa %xmm6,64(%ebx)
- movdqa %xmm7,96(%ebx)
- movdqa -112(%ebx),%xmm1
- movdqa -96(%ebx),%xmm2
- movdqa -80(%ebx),%xmm3
- paddd -128(%ebp),%xmm0
- paddd -112(%ebp),%xmm1
- paddd -96(%ebp),%xmm2
- paddd -80(%ebp),%xmm3
- movdqa %xmm0,%xmm6
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm6
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm6,%xmm3
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- movdqu -128(%esi),%xmm4
- movdqu -64(%esi),%xmm5
- movdqu (%esi),%xmm2
- movdqu 64(%esi),%xmm7
- leal 16(%esi),%esi
- pxor %xmm0,%xmm4
- movdqa -64(%ebx),%xmm0
- pxor %xmm1,%xmm5
- movdqa -48(%ebx),%xmm1
- pxor %xmm2,%xmm6
- movdqa -32(%ebx),%xmm2
- pxor %xmm3,%xmm7
- movdqa -16(%ebx),%xmm3
- movdqu %xmm4,-128(%edi)
- movdqu %xmm5,-64(%edi)
- movdqu %xmm6,(%edi)
- movdqu %xmm7,64(%edi)
- leal 16(%edi),%edi
- paddd -64(%ebp),%xmm0
- paddd -48(%ebp),%xmm1
- paddd -32(%ebp),%xmm2
- paddd -16(%ebp),%xmm3
- movdqa %xmm0,%xmm6
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm6
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm6,%xmm3
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- movdqu -128(%esi),%xmm4
- movdqu -64(%esi),%xmm5
- movdqu (%esi),%xmm2
- movdqu 64(%esi),%xmm7
- leal 16(%esi),%esi
- pxor %xmm0,%xmm4
- movdqa (%ebx),%xmm0
- pxor %xmm1,%xmm5
- movdqa 16(%ebx),%xmm1
- pxor %xmm2,%xmm6
- movdqa 32(%ebx),%xmm2
- pxor %xmm3,%xmm7
- movdqa 48(%ebx),%xmm3
- movdqu %xmm4,-128(%edi)
- movdqu %xmm5,-64(%edi)
- movdqu %xmm6,(%edi)
- movdqu %xmm7,64(%edi)
- leal 16(%edi),%edi
- paddd (%ebp),%xmm0
- paddd 16(%ebp),%xmm1
- paddd 32(%ebp),%xmm2
- paddd 48(%ebp),%xmm3
- movdqa %xmm0,%xmm6
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm6
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm6,%xmm3
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- movdqu -128(%esi),%xmm4
- movdqu -64(%esi),%xmm5
- movdqu (%esi),%xmm2
- movdqu 64(%esi),%xmm7
- leal 16(%esi),%esi
- pxor %xmm0,%xmm4
- movdqa 64(%ebx),%xmm0
- pxor %xmm1,%xmm5
- movdqa 80(%ebx),%xmm1
- pxor %xmm2,%xmm6
- movdqa 96(%ebx),%xmm2
- pxor %xmm3,%xmm7
- movdqa 112(%ebx),%xmm3
- movdqu %xmm4,-128(%edi)
- movdqu %xmm5,-64(%edi)
- movdqu %xmm6,(%edi)
- movdqu %xmm7,64(%edi)
- leal 16(%edi),%edi
- paddd 64(%ebp),%xmm0
- paddd 80(%ebp),%xmm1
- paddd 96(%ebp),%xmm2
- paddd 112(%ebp),%xmm3
- movdqa %xmm0,%xmm6
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm6
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm6,%xmm3
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- movdqu -128(%esi),%xmm4
- movdqu -64(%esi),%xmm5
- movdqu (%esi),%xmm2
- movdqu 64(%esi),%xmm7
- leal 208(%esi),%esi
- pxor %xmm0,%xmm4
- pxor %xmm1,%xmm5
- pxor %xmm2,%xmm6
- pxor %xmm3,%xmm7
- movdqu %xmm4,-128(%edi)
- movdqu %xmm5,-64(%edi)
- movdqu %xmm6,(%edi)
- movdqu %xmm7,64(%edi)
- leal 208(%edi),%edi
- subl $256,%ecx
- jnc L009outer_loop
- addl $256,%ecx
- jz L011done
- movl 520(%esp),%ebx
- leal -128(%esi),%esi
- movl 516(%esp),%edx
- leal -128(%edi),%edi
- movd 64(%ebp),%xmm2
- movdqu (%ebx),%xmm3
- paddd 96(%eax),%xmm2
- pand 112(%eax),%xmm3
- por %xmm2,%xmm3
-L0081x:
- movdqa 32(%eax),%xmm0
- movdqu (%edx),%xmm1
- movdqu 16(%edx),%xmm2
- movdqa (%eax),%xmm6
- movdqa 16(%eax),%xmm7
- movl %ebp,48(%esp)
- movdqa %xmm0,(%esp)
- movdqa %xmm1,16(%esp)
- movdqa %xmm2,32(%esp)
- movdqa %xmm3,48(%esp)
- movl $10,%edx
- jmp L012loop1x
-.align 4,0x90
-L013outer1x:
- movdqa 80(%eax),%xmm3
- movdqa (%esp),%xmm0
- movdqa 16(%esp),%xmm1
- movdqa 32(%esp),%xmm2
- paddd 48(%esp),%xmm3
- movl $10,%edx
- movdqa %xmm3,48(%esp)
- jmp L012loop1x
-.align 4,0x90
-L012loop1x:
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $57,%xmm1,%xmm1
- pshufd $147,%xmm3,%xmm3
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $147,%xmm1,%xmm1
- pshufd $57,%xmm3,%xmm3
- decl %edx
- jnz L012loop1x
- paddd (%esp),%xmm0
- paddd 16(%esp),%xmm1
- paddd 32(%esp),%xmm2
- paddd 48(%esp),%xmm3
- cmpl $64,%ecx
- jb L014tail
- movdqu (%esi),%xmm4
- movdqu 16(%esi),%xmm5
- pxor %xmm4,%xmm0
- movdqu 32(%esi),%xmm4
- pxor %xmm5,%xmm1
- movdqu 48(%esi),%xmm5
- pxor %xmm4,%xmm2
- pxor %xmm5,%xmm3
- leal 64(%esi),%esi
- movdqu %xmm0,(%edi)
- movdqu %xmm1,16(%edi)
- movdqu %xmm2,32(%edi)
- movdqu %xmm3,48(%edi)
- leal 64(%edi),%edi
- subl $64,%ecx
- jnz L013outer1x
- jmp L011done
-L014tail:
- movdqa %xmm0,(%esp)
- movdqa %xmm1,16(%esp)
- movdqa %xmm2,32(%esp)
- movdqa %xmm3,48(%esp)
- xorl %eax,%eax
- xorl %edx,%edx
- xorl %ebp,%ebp
-L015tail_loop:
- movb (%esp,%ebp,1),%al
- movb (%esi,%ebp,1),%dl
- leal 1(%ebp),%ebp
- xorb %dl,%al
- movb %al,-1(%edi,%ebp,1)
- decl %ecx
- jnz L015tail_loop
-L011done:
- movl 512(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 6,0x90
-Lssse3_data:
-.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
-.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
-.long 1634760805,857760878,2036477234,1797285236
-.long 0,1,2,3
-.long 4,4,4,4
-.long 1,0,0,0
-.long 4,0,0,0
-.long 0,-1,-1,-1
-.align 6,0x90
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
-.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
-.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
-.byte 114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/aesni-x86-apple.S b/apple-x86/crypto/fipsmodule/aesni-x86-apple.S
deleted file mode 100644
index 4467604..0000000
--- a/apple-x86/crypto/fipsmodule/aesni-x86-apple.S
+++ /dev/null
@@ -1,2475 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-#ifdef BORINGSSL_DISPATCH_TEST
-#endif
-.globl _aes_hw_encrypt
-.private_extern _aes_hw_encrypt
-.align 4
-_aes_hw_encrypt:
-L_aes_hw_encrypt_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L000pic
-L000pic:
- popl %ebx
- leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 4(%esp),%eax
- movl 12(%esp),%edx
- movups (%eax),%xmm2
- movl 240(%edx),%ecx
- movl 8(%esp),%eax
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L001enc1_loop_1:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L001enc1_loop_1
-.byte 102,15,56,221,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movups %xmm2,(%eax)
- pxor %xmm2,%xmm2
- ret
-.globl _aes_hw_decrypt
-.private_extern _aes_hw_decrypt
-.align 4
-_aes_hw_decrypt:
-L_aes_hw_decrypt_begin:
- movl 4(%esp),%eax
- movl 12(%esp),%edx
- movups (%eax),%xmm2
- movl 240(%edx),%ecx
- movl 8(%esp),%eax
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L002dec1_loop_2:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L002dec1_loop_2
-.byte 102,15,56,223,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movups %xmm2,(%eax)
- pxor %xmm2,%xmm2
- ret
-.private_extern __aesni_encrypt2
-.align 4
-__aesni_encrypt2:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
- addl $16,%ecx
-L003enc2_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- movups -16(%edx,%ecx,1),%xmm0
- jnz L003enc2_loop
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
- ret
-.private_extern __aesni_decrypt2
-.align 4
-__aesni_decrypt2:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
- addl $16,%ecx
-L004dec2_loop:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
- movups -16(%edx,%ecx,1),%xmm0
- jnz L004dec2_loop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
- ret
-.private_extern __aesni_encrypt3
-.align 4
-__aesni_encrypt3:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
- addl $16,%ecx
-L005enc3_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
- movups -16(%edx,%ecx,1),%xmm0
- jnz L005enc3_loop
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
- ret
-.private_extern __aesni_decrypt3
-.align 4
-__aesni_decrypt3:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
- addl $16,%ecx
-L006dec3_loop:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
- movups -16(%edx,%ecx,1),%xmm0
- jnz L006dec3_loop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
- ret
-.private_extern __aesni_encrypt4
-.align 4
-__aesni_encrypt4:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- shll $4,%ecx
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
- pxor %xmm0,%xmm5
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
-.byte 15,31,64,0
- addl $16,%ecx
-L007enc4_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
- movups -16(%edx,%ecx,1),%xmm0
- jnz L007enc4_loop
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
- ret
-.private_extern __aesni_decrypt4
-.align 4
-__aesni_decrypt4:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- shll $4,%ecx
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
- pxor %xmm0,%xmm5
- movups 32(%edx),%xmm0
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
-.byte 15,31,64,0
- addl $16,%ecx
-L008dec4_loop:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
- movups -16(%edx,%ecx,1),%xmm0
- jnz L008dec4_loop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
- ret
-.private_extern __aesni_encrypt6
-.align 4
-__aesni_encrypt6:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
-.byte 102,15,56,220,209
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
-.byte 102,15,56,220,217
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm7
- movups (%edx,%ecx,1),%xmm0
- addl $16,%ecx
- jmp L009_aesni_encrypt6_inner
-.align 4,0x90
-L010enc6_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-L009_aesni_encrypt6_inner:
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-L_aesni_encrypt6_enter:
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
- movups -16(%edx,%ecx,1),%xmm0
- jnz L010enc6_loop
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
- ret
-.private_extern __aesni_decrypt6
-.align 4
-__aesni_decrypt6:
- movups (%edx),%xmm0
- shll $4,%ecx
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
-.byte 102,15,56,222,209
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
-.byte 102,15,56,222,217
- leal 32(%edx,%ecx,1),%edx
- negl %ecx
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm7
- movups (%edx,%ecx,1),%xmm0
- addl $16,%ecx
- jmp L011_aesni_decrypt6_inner
-.align 4,0x90
-L012dec6_loop:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-L011_aesni_decrypt6_inner:
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-L_aesni_decrypt6_enter:
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
- movups -16(%edx,%ecx,1),%xmm0
- jnz L012dec6_loop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
- ret
-.globl _aes_hw_ecb_encrypt
-.private_extern _aes_hw_ecb_encrypt
-.align 4
-_aes_hw_ecb_encrypt:
-L_aes_hw_ecb_encrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl 36(%esp),%ebx
- andl $-16,%eax
- jz L013ecb_ret
- movl 240(%edx),%ecx
- testl %ebx,%ebx
- jz L014ecb_decrypt
- movl %edx,%ebp
- movl %ecx,%ebx
- cmpl $96,%eax
- jb L015ecb_enc_tail
- movdqu (%esi),%xmm2
- movdqu 16(%esi),%xmm3
- movdqu 32(%esi),%xmm4
- movdqu 48(%esi),%xmm5
- movdqu 64(%esi),%xmm6
- movdqu 80(%esi),%xmm7
- leal 96(%esi),%esi
- subl $96,%eax
- jmp L016ecb_enc_loop6_enter
-.align 4,0x90
-L017ecb_enc_loop6:
- movups %xmm2,(%edi)
- movdqu (%esi),%xmm2
- movups %xmm3,16(%edi)
- movdqu 16(%esi),%xmm3
- movups %xmm4,32(%edi)
- movdqu 32(%esi),%xmm4
- movups %xmm5,48(%edi)
- movdqu 48(%esi),%xmm5
- movups %xmm6,64(%edi)
- movdqu 64(%esi),%xmm6
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- movdqu 80(%esi),%xmm7
- leal 96(%esi),%esi
-L016ecb_enc_loop6_enter:
- call __aesni_encrypt6
- movl %ebp,%edx
- movl %ebx,%ecx
- subl $96,%eax
- jnc L017ecb_enc_loop6
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- addl $96,%eax
- jz L013ecb_ret
-L015ecb_enc_tail:
- movups (%esi),%xmm2
- cmpl $32,%eax
- jb L018ecb_enc_one
- movups 16(%esi),%xmm3
- je L019ecb_enc_two
- movups 32(%esi),%xmm4
- cmpl $64,%eax
- jb L020ecb_enc_three
- movups 48(%esi),%xmm5
- je L021ecb_enc_four
- movups 64(%esi),%xmm6
- xorps %xmm7,%xmm7
- call __aesni_encrypt6
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L018ecb_enc_one:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L022enc1_loop_3:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L022enc1_loop_3
-.byte 102,15,56,221,209
- movups %xmm2,(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L019ecb_enc_two:
- call __aesni_encrypt2
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L020ecb_enc_three:
- call __aesni_encrypt3
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L021ecb_enc_four:
- call __aesni_encrypt4
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L014ecb_decrypt:
- movl %edx,%ebp
- movl %ecx,%ebx
- cmpl $96,%eax
- jb L023ecb_dec_tail
- movdqu (%esi),%xmm2
- movdqu 16(%esi),%xmm3
- movdqu 32(%esi),%xmm4
- movdqu 48(%esi),%xmm5
- movdqu 64(%esi),%xmm6
- movdqu 80(%esi),%xmm7
- leal 96(%esi),%esi
- subl $96,%eax
- jmp L024ecb_dec_loop6_enter
-.align 4,0x90
-L025ecb_dec_loop6:
- movups %xmm2,(%edi)
- movdqu (%esi),%xmm2
- movups %xmm3,16(%edi)
- movdqu 16(%esi),%xmm3
- movups %xmm4,32(%edi)
- movdqu 32(%esi),%xmm4
- movups %xmm5,48(%edi)
- movdqu 48(%esi),%xmm5
- movups %xmm6,64(%edi)
- movdqu 64(%esi),%xmm6
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- movdqu 80(%esi),%xmm7
- leal 96(%esi),%esi
-L024ecb_dec_loop6_enter:
- call __aesni_decrypt6
- movl %ebp,%edx
- movl %ebx,%ecx
- subl $96,%eax
- jnc L025ecb_dec_loop6
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- addl $96,%eax
- jz L013ecb_ret
-L023ecb_dec_tail:
- movups (%esi),%xmm2
- cmpl $32,%eax
- jb L026ecb_dec_one
- movups 16(%esi),%xmm3
- je L027ecb_dec_two
- movups 32(%esi),%xmm4
- cmpl $64,%eax
- jb L028ecb_dec_three
- movups 48(%esi),%xmm5
- je L029ecb_dec_four
- movups 64(%esi),%xmm6
- xorps %xmm7,%xmm7
- call __aesni_decrypt6
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L026ecb_dec_one:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L030dec1_loop_4:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L030dec1_loop_4
-.byte 102,15,56,223,209
- movups %xmm2,(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L027ecb_dec_two:
- call __aesni_decrypt2
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L028ecb_dec_three:
- call __aesni_decrypt3
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- jmp L013ecb_ret
-.align 4,0x90
-L029ecb_dec_four:
- call __aesni_decrypt4
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
-L013ecb_ret:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_ccm64_encrypt_blocks
-.private_extern _aes_hw_ccm64_encrypt_blocks
-.align 4
-_aes_hw_ccm64_encrypt_blocks:
-L_aes_hw_ccm64_encrypt_blocks_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl 36(%esp),%ebx
- movl 40(%esp),%ecx
- movl %esp,%ebp
- subl $60,%esp
- andl $-16,%esp
- movl %ebp,48(%esp)
- movdqu (%ebx),%xmm7
- movdqu (%ecx),%xmm3
- movl 240(%edx),%ecx
- movl $202182159,(%esp)
- movl $134810123,4(%esp)
- movl $67438087,8(%esp)
- movl $66051,12(%esp)
- movl $1,%ebx
- xorl %ebp,%ebp
- movl %ebx,16(%esp)
- movl %ebp,20(%esp)
- movl %ebp,24(%esp)
- movl %ebp,28(%esp)
- shll $4,%ecx
- movl $16,%ebx
- leal (%edx),%ebp
- movdqa (%esp),%xmm5
- movdqa %xmm7,%xmm2
- leal 32(%edx,%ecx,1),%edx
- subl %ecx,%ebx
-.byte 102,15,56,0,253
-L031ccm64_enc_outer:
- movups (%ebp),%xmm0
- movl %ebx,%ecx
- movups (%esi),%xmm6
- xorps %xmm0,%xmm2
- movups 16(%ebp),%xmm1
- xorps %xmm6,%xmm0
- xorps %xmm0,%xmm3
- movups 32(%ebp),%xmm0
-L032ccm64_enc2_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- movups -16(%edx,%ecx,1),%xmm0
- jnz L032ccm64_enc2_loop
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- paddq 16(%esp),%xmm7
- decl %eax
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
- leal 16(%esi),%esi
- xorps %xmm2,%xmm6
- movdqa %xmm7,%xmm2
- movups %xmm6,(%edi)
-.byte 102,15,56,0,213
- leal 16(%edi),%edi
- jnz L031ccm64_enc_outer
- movl 48(%esp),%esp
- movl 40(%esp),%edi
- movups %xmm3,(%edi)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_ccm64_decrypt_blocks
-.private_extern _aes_hw_ccm64_decrypt_blocks
-.align 4
-_aes_hw_ccm64_decrypt_blocks:
-L_aes_hw_ccm64_decrypt_blocks_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl 36(%esp),%ebx
- movl 40(%esp),%ecx
- movl %esp,%ebp
- subl $60,%esp
- andl $-16,%esp
- movl %ebp,48(%esp)
- movdqu (%ebx),%xmm7
- movdqu (%ecx),%xmm3
- movl 240(%edx),%ecx
- movl $202182159,(%esp)
- movl $134810123,4(%esp)
- movl $67438087,8(%esp)
- movl $66051,12(%esp)
- movl $1,%ebx
- xorl %ebp,%ebp
- movl %ebx,16(%esp)
- movl %ebp,20(%esp)
- movl %ebp,24(%esp)
- movl %ebp,28(%esp)
- movdqa (%esp),%xmm5
- movdqa %xmm7,%xmm2
- movl %edx,%ebp
- movl %ecx,%ebx
-.byte 102,15,56,0,253
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L033enc1_loop_5:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L033enc1_loop_5
-.byte 102,15,56,221,209
- shll $4,%ebx
- movl $16,%ecx
- movups (%esi),%xmm6
- paddq 16(%esp),%xmm7
- leal 16(%esi),%esi
- subl %ebx,%ecx
- leal 32(%ebp,%ebx,1),%edx
- movl %ecx,%ebx
- jmp L034ccm64_dec_outer
-.align 4,0x90
-L034ccm64_dec_outer:
- xorps %xmm2,%xmm6
- movdqa %xmm7,%xmm2
- movups %xmm6,(%edi)
- leal 16(%edi),%edi
-.byte 102,15,56,0,213
- subl $1,%eax
- jz L035ccm64_dec_break
- movups (%ebp),%xmm0
- movl %ebx,%ecx
- movups 16(%ebp),%xmm1
- xorps %xmm0,%xmm6
- xorps %xmm0,%xmm2
- xorps %xmm6,%xmm3
- movups 32(%ebp),%xmm0
-L036ccm64_dec2_loop:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- movups (%edx,%ecx,1),%xmm1
- addl $32,%ecx
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- movups -16(%edx,%ecx,1),%xmm0
- jnz L036ccm64_dec2_loop
- movups (%esi),%xmm6
- paddq 16(%esp),%xmm7
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
- leal 16(%esi),%esi
- jmp L034ccm64_dec_outer
-.align 4,0x90
-L035ccm64_dec_break:
- movl 240(%ebp),%ecx
- movl %ebp,%edx
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm6
- leal 32(%edx),%edx
- xorps %xmm6,%xmm3
-L037enc1_loop_6:
-.byte 102,15,56,220,217
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L037enc1_loop_6
-.byte 102,15,56,221,217
- movl 48(%esp),%esp
- movl 40(%esp),%edi
- movups %xmm3,(%edi)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_ctr32_encrypt_blocks
-.private_extern _aes_hw_ctr32_encrypt_blocks
-.align 4
-_aes_hw_ctr32_encrypt_blocks:
-L_aes_hw_ctr32_encrypt_blocks_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L038pic
-L038pic:
- popl %ebx
- leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl 36(%esp),%ebx
- movl %esp,%ebp
- subl $88,%esp
- andl $-16,%esp
- movl %ebp,80(%esp)
- cmpl $1,%eax
- je L039ctr32_one_shortcut
- movdqu (%ebx),%xmm7
- movl $202182159,(%esp)
- movl $134810123,4(%esp)
- movl $67438087,8(%esp)
- movl $66051,12(%esp)
- movl $6,%ecx
- xorl %ebp,%ebp
- movl %ecx,16(%esp)
- movl %ecx,20(%esp)
- movl %ecx,24(%esp)
- movl %ebp,28(%esp)
-.byte 102,15,58,22,251,3
-.byte 102,15,58,34,253,3
- movl 240(%edx),%ecx
- bswap %ebx
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movdqa (%esp),%xmm2
-.byte 102,15,58,34,195,0
- leal 3(%ebx),%ebp
-.byte 102,15,58,34,205,0
- incl %ebx
-.byte 102,15,58,34,195,1
- incl %ebp
-.byte 102,15,58,34,205,1
- incl %ebx
-.byte 102,15,58,34,195,2
- incl %ebp
-.byte 102,15,58,34,205,2
- movdqa %xmm0,48(%esp)
-.byte 102,15,56,0,194
- movdqu (%edx),%xmm6
- movdqa %xmm1,64(%esp)
-.byte 102,15,56,0,202
- pshufd $192,%xmm0,%xmm2
- pshufd $128,%xmm0,%xmm3
- cmpl $6,%eax
- jb L040ctr32_tail
- pxor %xmm6,%xmm7
- shll $4,%ecx
- movl $16,%ebx
- movdqa %xmm7,32(%esp)
- movl %edx,%ebp
- subl %ecx,%ebx
- leal 32(%edx,%ecx,1),%edx
- subl $6,%eax
- jmp L041ctr32_loop6
-.align 4,0x90
-L041ctr32_loop6:
- pshufd $64,%xmm0,%xmm4
- movdqa 32(%esp),%xmm0
- pshufd $192,%xmm1,%xmm5
- pxor %xmm0,%xmm2
- pshufd $128,%xmm1,%xmm6
- pxor %xmm0,%xmm3
- pshufd $64,%xmm1,%xmm7
- movups 16(%ebp),%xmm1
- pxor %xmm0,%xmm4
- pxor %xmm0,%xmm5
-.byte 102,15,56,220,209
- pxor %xmm0,%xmm6
- pxor %xmm0,%xmm7
-.byte 102,15,56,220,217
- movups 32(%ebp),%xmm0
- movl %ebx,%ecx
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- call L_aesni_encrypt6_enter
- movups (%esi),%xmm1
- movups 16(%esi),%xmm0
- xorps %xmm1,%xmm2
- movups 32(%esi),%xmm1
- xorps %xmm0,%xmm3
- movups %xmm2,(%edi)
- movdqa 16(%esp),%xmm0
- xorps %xmm1,%xmm4
- movdqa 64(%esp),%xmm1
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- paddd %xmm0,%xmm1
- paddd 48(%esp),%xmm0
- movdqa (%esp),%xmm2
- movups 48(%esi),%xmm3
- movups 64(%esi),%xmm4
- xorps %xmm3,%xmm5
- movups 80(%esi),%xmm3
- leal 96(%esi),%esi
- movdqa %xmm0,48(%esp)
-.byte 102,15,56,0,194
- xorps %xmm4,%xmm6
- movups %xmm5,48(%edi)
- xorps %xmm3,%xmm7
- movdqa %xmm1,64(%esp)
-.byte 102,15,56,0,202
- movups %xmm6,64(%edi)
- pshufd $192,%xmm0,%xmm2
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- pshufd $128,%xmm0,%xmm3
- subl $6,%eax
- jnc L041ctr32_loop6
- addl $6,%eax
- jz L042ctr32_ret
- movdqu (%ebp),%xmm7
- movl %ebp,%edx
- pxor 32(%esp),%xmm7
- movl 240(%ebp),%ecx
-L040ctr32_tail:
- por %xmm7,%xmm2
- cmpl $2,%eax
- jb L043ctr32_one
- pshufd $64,%xmm0,%xmm4
- por %xmm7,%xmm3
- je L044ctr32_two
- pshufd $192,%xmm1,%xmm5
- por %xmm7,%xmm4
- cmpl $4,%eax
- jb L045ctr32_three
- pshufd $128,%xmm1,%xmm6
- por %xmm7,%xmm5
- je L046ctr32_four
- por %xmm7,%xmm6
- call __aesni_encrypt6
- movups (%esi),%xmm1
- movups 16(%esi),%xmm0
- xorps %xmm1,%xmm2
- movups 32(%esi),%xmm1
- xorps %xmm0,%xmm3
- movups 48(%esi),%xmm0
- xorps %xmm1,%xmm4
- movups 64(%esi),%xmm1
- xorps %xmm0,%xmm5
- movups %xmm2,(%edi)
- xorps %xmm1,%xmm6
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- jmp L042ctr32_ret
-.align 4,0x90
-L039ctr32_one_shortcut:
- movups (%ebx),%xmm2
- movl 240(%edx),%ecx
-L043ctr32_one:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L047enc1_loop_7:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L047enc1_loop_7
-.byte 102,15,56,221,209
- movups (%esi),%xmm6
- xorps %xmm2,%xmm6
- movups %xmm6,(%edi)
- jmp L042ctr32_ret
-.align 4,0x90
-L044ctr32_two:
- call __aesni_encrypt2
- movups (%esi),%xmm5
- movups 16(%esi),%xmm6
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- jmp L042ctr32_ret
-.align 4,0x90
-L045ctr32_three:
- call __aesni_encrypt3
- movups (%esi),%xmm5
- movups 16(%esi),%xmm6
- xorps %xmm5,%xmm2
- movups 32(%esi),%xmm7
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- xorps %xmm7,%xmm4
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- jmp L042ctr32_ret
-.align 4,0x90
-L046ctr32_four:
- call __aesni_encrypt4
- movups (%esi),%xmm6
- movups 16(%esi),%xmm7
- movups 32(%esi),%xmm1
- xorps %xmm6,%xmm2
- movups 48(%esi),%xmm0
- xorps %xmm7,%xmm3
- movups %xmm2,(%edi)
- xorps %xmm1,%xmm4
- movups %xmm3,16(%edi)
- xorps %xmm0,%xmm5
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
-L042ctr32_ret:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- movdqa %xmm0,32(%esp)
- pxor %xmm5,%xmm5
- movdqa %xmm0,48(%esp)
- pxor %xmm6,%xmm6
- movdqa %xmm0,64(%esp)
- pxor %xmm7,%xmm7
- movl 80(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_xts_encrypt
-.private_extern _aes_hw_xts_encrypt
-.align 4
-_aes_hw_xts_encrypt:
-L_aes_hw_xts_encrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 36(%esp),%edx
- movl 40(%esp),%esi
- movl 240(%edx),%ecx
- movups (%esi),%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L048enc1_loop_8:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L048enc1_loop_8
-.byte 102,15,56,221,209
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl %esp,%ebp
- subl $120,%esp
- movl 240(%edx),%ecx
- andl $-16,%esp
- movl $135,96(%esp)
- movl $0,100(%esp)
- movl $1,104(%esp)
- movl $0,108(%esp)
- movl %eax,112(%esp)
- movl %ebp,116(%esp)
- movdqa %xmm2,%xmm1
- pxor %xmm0,%xmm0
- movdqa 96(%esp),%xmm3
- pcmpgtd %xmm1,%xmm0
- andl $-16,%eax
- movl %edx,%ebp
- movl %ecx,%ebx
- subl $96,%eax
- jc L049xts_enc_short
- shll $4,%ecx
- movl $16,%ebx
- subl %ecx,%ebx
- leal 32(%edx,%ecx,1),%edx
- jmp L050xts_enc_loop6
-.align 4,0x90
-L050xts_enc_loop6:
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,16(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,32(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,48(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm7
- movdqa %xmm1,64(%esp)
- paddq %xmm1,%xmm1
- movups (%ebp),%xmm0
- pand %xmm3,%xmm7
- movups (%esi),%xmm2
- pxor %xmm1,%xmm7
- movl %ebx,%ecx
- movdqu 16(%esi),%xmm3
- xorps %xmm0,%xmm2
- movdqu 32(%esi),%xmm4
- pxor %xmm0,%xmm3
- movdqu 48(%esi),%xmm5
- pxor %xmm0,%xmm4
- movdqu 64(%esi),%xmm6
- pxor %xmm0,%xmm5
- movdqu 80(%esi),%xmm1
- pxor %xmm0,%xmm6
- leal 96(%esi),%esi
- pxor (%esp),%xmm2
- movdqa %xmm7,80(%esp)
- pxor %xmm1,%xmm7
- movups 16(%ebp),%xmm1
- pxor 16(%esp),%xmm3
- pxor 32(%esp),%xmm4
-.byte 102,15,56,220,209
- pxor 48(%esp),%xmm5
- pxor 64(%esp),%xmm6
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm7
- movups 32(%ebp),%xmm0
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- call L_aesni_encrypt6_enter
- movdqa 80(%esp),%xmm1
- pxor %xmm0,%xmm0
- xorps (%esp),%xmm2
- pcmpgtd %xmm1,%xmm0
- xorps 16(%esp),%xmm3
- movups %xmm2,(%edi)
- xorps 32(%esp),%xmm4
- movups %xmm3,16(%edi)
- xorps 48(%esp),%xmm5
- movups %xmm4,32(%edi)
- xorps 64(%esp),%xmm6
- movups %xmm5,48(%edi)
- xorps %xmm1,%xmm7
- movups %xmm6,64(%edi)
- pshufd $19,%xmm0,%xmm2
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- movdqa 96(%esp),%xmm3
- pxor %xmm0,%xmm0
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- subl $96,%eax
- jnc L050xts_enc_loop6
- movl 240(%ebp),%ecx
- movl %ebp,%edx
- movl %ecx,%ebx
-L049xts_enc_short:
- addl $96,%eax
- jz L051xts_enc_done6x
- movdqa %xmm1,%xmm5
- cmpl $32,%eax
- jb L052xts_enc_one
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- je L053xts_enc_two
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,%xmm6
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- cmpl $64,%eax
- jb L054xts_enc_three
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,%xmm7
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- movdqa %xmm5,(%esp)
- movdqa %xmm6,16(%esp)
- je L055xts_enc_four
- movdqa %xmm7,32(%esp)
- pshufd $19,%xmm0,%xmm7
- movdqa %xmm1,48(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm7
- pxor %xmm1,%xmm7
- movdqu (%esi),%xmm2
- movdqu 16(%esi),%xmm3
- movdqu 32(%esi),%xmm4
- pxor (%esp),%xmm2
- movdqu 48(%esi),%xmm5
- pxor 16(%esp),%xmm3
- movdqu 64(%esi),%xmm6
- pxor 32(%esp),%xmm4
- leal 80(%esi),%esi
- pxor 48(%esp),%xmm5
- movdqa %xmm7,64(%esp)
- pxor %xmm7,%xmm6
- call __aesni_encrypt6
- movaps 64(%esp),%xmm1
- xorps (%esp),%xmm2
- xorps 16(%esp),%xmm3
- xorps 32(%esp),%xmm4
- movups %xmm2,(%edi)
- xorps 48(%esp),%xmm5
- movups %xmm3,16(%edi)
- xorps %xmm1,%xmm6
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- leal 80(%edi),%edi
- jmp L056xts_enc_done
-.align 4,0x90
-L052xts_enc_one:
- movups (%esi),%xmm2
- leal 16(%esi),%esi
- xorps %xmm5,%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L057enc1_loop_9:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L057enc1_loop_9
-.byte 102,15,56,221,209
- xorps %xmm5,%xmm2
- movups %xmm2,(%edi)
- leal 16(%edi),%edi
- movdqa %xmm5,%xmm1
- jmp L056xts_enc_done
-.align 4,0x90
-L053xts_enc_two:
- movaps %xmm1,%xmm6
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- leal 32(%esi),%esi
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- call __aesni_encrypt2
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- leal 32(%edi),%edi
- movdqa %xmm6,%xmm1
- jmp L056xts_enc_done
-.align 4,0x90
-L054xts_enc_three:
- movaps %xmm1,%xmm7
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- movups 32(%esi),%xmm4
- leal 48(%esi),%esi
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm4
- call __aesni_encrypt3
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm4
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- leal 48(%edi),%edi
- movdqa %xmm7,%xmm1
- jmp L056xts_enc_done
-.align 4,0x90
-L055xts_enc_four:
- movaps %xmm1,%xmm6
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- movups 32(%esi),%xmm4
- xorps (%esp),%xmm2
- movups 48(%esi),%xmm5
- leal 64(%esi),%esi
- xorps 16(%esp),%xmm3
- xorps %xmm7,%xmm4
- xorps %xmm6,%xmm5
- call __aesni_encrypt4
- xorps (%esp),%xmm2
- xorps 16(%esp),%xmm3
- xorps %xmm7,%xmm4
- movups %xmm2,(%edi)
- xorps %xmm6,%xmm5
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- leal 64(%edi),%edi
- movdqa %xmm6,%xmm1
- jmp L056xts_enc_done
-.align 4,0x90
-L051xts_enc_done6x:
- movl 112(%esp),%eax
- andl $15,%eax
- jz L058xts_enc_ret
- movdqa %xmm1,%xmm5
- movl %eax,112(%esp)
- jmp L059xts_enc_steal
-.align 4,0x90
-L056xts_enc_done:
- movl 112(%esp),%eax
- pxor %xmm0,%xmm0
- andl $15,%eax
- jz L058xts_enc_ret
- pcmpgtd %xmm1,%xmm0
- movl %eax,112(%esp)
- pshufd $19,%xmm0,%xmm5
- paddq %xmm1,%xmm1
- pand 96(%esp),%xmm5
- pxor %xmm1,%xmm5
-L059xts_enc_steal:
- movzbl (%esi),%ecx
- movzbl -16(%edi),%edx
- leal 1(%esi),%esi
- movb %cl,-16(%edi)
- movb %dl,(%edi)
- leal 1(%edi),%edi
- subl $1,%eax
- jnz L059xts_enc_steal
- subl 112(%esp),%edi
- movl %ebp,%edx
- movl %ebx,%ecx
- movups -16(%edi),%xmm2
- xorps %xmm5,%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L060enc1_loop_10:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L060enc1_loop_10
-.byte 102,15,56,221,209
- xorps %xmm5,%xmm2
- movups %xmm2,-16(%edi)
-L058xts_enc_ret:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- movdqa %xmm0,(%esp)
- pxor %xmm3,%xmm3
- movdqa %xmm0,16(%esp)
- pxor %xmm4,%xmm4
- movdqa %xmm0,32(%esp)
- pxor %xmm5,%xmm5
- movdqa %xmm0,48(%esp)
- pxor %xmm6,%xmm6
- movdqa %xmm0,64(%esp)
- pxor %xmm7,%xmm7
- movdqa %xmm0,80(%esp)
- movl 116(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_xts_decrypt
-.private_extern _aes_hw_xts_decrypt
-.align 4
-_aes_hw_xts_decrypt:
-L_aes_hw_xts_decrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 36(%esp),%edx
- movl 40(%esp),%esi
- movl 240(%edx),%ecx
- movups (%esi),%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L061enc1_loop_11:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L061enc1_loop_11
-.byte 102,15,56,221,209
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- movl %esp,%ebp
- subl $120,%esp
- andl $-16,%esp
- xorl %ebx,%ebx
- testl $15,%eax
- setnz %bl
- shll $4,%ebx
- subl %ebx,%eax
- movl $135,96(%esp)
- movl $0,100(%esp)
- movl $1,104(%esp)
- movl $0,108(%esp)
- movl %eax,112(%esp)
- movl %ebp,116(%esp)
- movl 240(%edx),%ecx
- movl %edx,%ebp
- movl %ecx,%ebx
- movdqa %xmm2,%xmm1
- pxor %xmm0,%xmm0
- movdqa 96(%esp),%xmm3
- pcmpgtd %xmm1,%xmm0
- andl $-16,%eax
- subl $96,%eax
- jc L062xts_dec_short
- shll $4,%ecx
- movl $16,%ebx
- subl %ecx,%ebx
- leal 32(%edx,%ecx,1),%edx
- jmp L063xts_dec_loop6
-.align 4,0x90
-L063xts_dec_loop6:
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,16(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,32(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,48(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- pshufd $19,%xmm0,%xmm7
- movdqa %xmm1,64(%esp)
- paddq %xmm1,%xmm1
- movups (%ebp),%xmm0
- pand %xmm3,%xmm7
- movups (%esi),%xmm2
- pxor %xmm1,%xmm7
- movl %ebx,%ecx
- movdqu 16(%esi),%xmm3
- xorps %xmm0,%xmm2
- movdqu 32(%esi),%xmm4
- pxor %xmm0,%xmm3
- movdqu 48(%esi),%xmm5
- pxor %xmm0,%xmm4
- movdqu 64(%esi),%xmm6
- pxor %xmm0,%xmm5
- movdqu 80(%esi),%xmm1
- pxor %xmm0,%xmm6
- leal 96(%esi),%esi
- pxor (%esp),%xmm2
- movdqa %xmm7,80(%esp)
- pxor %xmm1,%xmm7
- movups 16(%ebp),%xmm1
- pxor 16(%esp),%xmm3
- pxor 32(%esp),%xmm4
-.byte 102,15,56,222,209
- pxor 48(%esp),%xmm5
- pxor 64(%esp),%xmm6
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm7
- movups 32(%ebp),%xmm0
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- call L_aesni_decrypt6_enter
- movdqa 80(%esp),%xmm1
- pxor %xmm0,%xmm0
- xorps (%esp),%xmm2
- pcmpgtd %xmm1,%xmm0
- xorps 16(%esp),%xmm3
- movups %xmm2,(%edi)
- xorps 32(%esp),%xmm4
- movups %xmm3,16(%edi)
- xorps 48(%esp),%xmm5
- movups %xmm4,32(%edi)
- xorps 64(%esp),%xmm6
- movups %xmm5,48(%edi)
- xorps %xmm1,%xmm7
- movups %xmm6,64(%edi)
- pshufd $19,%xmm0,%xmm2
- movups %xmm7,80(%edi)
- leal 96(%edi),%edi
- movdqa 96(%esp),%xmm3
- pxor %xmm0,%xmm0
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- subl $96,%eax
- jnc L063xts_dec_loop6
- movl 240(%ebp),%ecx
- movl %ebp,%edx
- movl %ecx,%ebx
-L062xts_dec_short:
- addl $96,%eax
- jz L064xts_dec_done6x
- movdqa %xmm1,%xmm5
- cmpl $32,%eax
- jb L065xts_dec_one
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- je L066xts_dec_two
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,%xmm6
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- cmpl $64,%eax
- jb L067xts_dec_three
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa %xmm1,%xmm7
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
- movdqa %xmm5,(%esp)
- movdqa %xmm6,16(%esp)
- je L068xts_dec_four
- movdqa %xmm7,32(%esp)
- pshufd $19,%xmm0,%xmm7
- movdqa %xmm1,48(%esp)
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm7
- pxor %xmm1,%xmm7
- movdqu (%esi),%xmm2
- movdqu 16(%esi),%xmm3
- movdqu 32(%esi),%xmm4
- pxor (%esp),%xmm2
- movdqu 48(%esi),%xmm5
- pxor 16(%esp),%xmm3
- movdqu 64(%esi),%xmm6
- pxor 32(%esp),%xmm4
- leal 80(%esi),%esi
- pxor 48(%esp),%xmm5
- movdqa %xmm7,64(%esp)
- pxor %xmm7,%xmm6
- call __aesni_decrypt6
- movaps 64(%esp),%xmm1
- xorps (%esp),%xmm2
- xorps 16(%esp),%xmm3
- xorps 32(%esp),%xmm4
- movups %xmm2,(%edi)
- xorps 48(%esp),%xmm5
- movups %xmm3,16(%edi)
- xorps %xmm1,%xmm6
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- movups %xmm6,64(%edi)
- leal 80(%edi),%edi
- jmp L069xts_dec_done
-.align 4,0x90
-L065xts_dec_one:
- movups (%esi),%xmm2
- leal 16(%esi),%esi
- xorps %xmm5,%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L070dec1_loop_12:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L070dec1_loop_12
-.byte 102,15,56,223,209
- xorps %xmm5,%xmm2
- movups %xmm2,(%edi)
- leal 16(%edi),%edi
- movdqa %xmm5,%xmm1
- jmp L069xts_dec_done
-.align 4,0x90
-L066xts_dec_two:
- movaps %xmm1,%xmm6
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- leal 32(%esi),%esi
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- call __aesni_decrypt2
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- leal 32(%edi),%edi
- movdqa %xmm6,%xmm1
- jmp L069xts_dec_done
-.align 4,0x90
-L067xts_dec_three:
- movaps %xmm1,%xmm7
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- movups 32(%esi),%xmm4
- leal 48(%esi),%esi
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm4
- call __aesni_decrypt3
- xorps %xmm5,%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm7,%xmm4
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- leal 48(%edi),%edi
- movdqa %xmm7,%xmm1
- jmp L069xts_dec_done
-.align 4,0x90
-L068xts_dec_four:
- movaps %xmm1,%xmm6
- movups (%esi),%xmm2
- movups 16(%esi),%xmm3
- movups 32(%esi),%xmm4
- xorps (%esp),%xmm2
- movups 48(%esi),%xmm5
- leal 64(%esi),%esi
- xorps 16(%esp),%xmm3
- xorps %xmm7,%xmm4
- xorps %xmm6,%xmm5
- call __aesni_decrypt4
- xorps (%esp),%xmm2
- xorps 16(%esp),%xmm3
- xorps %xmm7,%xmm4
- movups %xmm2,(%edi)
- xorps %xmm6,%xmm5
- movups %xmm3,16(%edi)
- movups %xmm4,32(%edi)
- movups %xmm5,48(%edi)
- leal 64(%edi),%edi
- movdqa %xmm6,%xmm1
- jmp L069xts_dec_done
-.align 4,0x90
-L064xts_dec_done6x:
- movl 112(%esp),%eax
- andl $15,%eax
- jz L071xts_dec_ret
- movl %eax,112(%esp)
- jmp L072xts_dec_only_one_more
-.align 4,0x90
-L069xts_dec_done:
- movl 112(%esp),%eax
- pxor %xmm0,%xmm0
- andl $15,%eax
- jz L071xts_dec_ret
- pcmpgtd %xmm1,%xmm0
- movl %eax,112(%esp)
- pshufd $19,%xmm0,%xmm2
- pxor %xmm0,%xmm0
- movdqa 96(%esp),%xmm3
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm2
- pcmpgtd %xmm1,%xmm0
- pxor %xmm2,%xmm1
-L072xts_dec_only_one_more:
- pshufd $19,%xmm0,%xmm5
- movdqa %xmm1,%xmm6
- paddq %xmm1,%xmm1
- pand %xmm3,%xmm5
- pxor %xmm1,%xmm5
- movl %ebp,%edx
- movl %ebx,%ecx
- movups (%esi),%xmm2
- xorps %xmm5,%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L073dec1_loop_13:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L073dec1_loop_13
-.byte 102,15,56,223,209
- xorps %xmm5,%xmm2
- movups %xmm2,(%edi)
-L074xts_dec_steal:
- movzbl 16(%esi),%ecx
- movzbl (%edi),%edx
- leal 1(%esi),%esi
- movb %cl,(%edi)
- movb %dl,16(%edi)
- leal 1(%edi),%edi
- subl $1,%eax
- jnz L074xts_dec_steal
- subl 112(%esp),%edi
- movl %ebp,%edx
- movl %ebx,%ecx
- movups (%edi),%xmm2
- xorps %xmm6,%xmm2
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L075dec1_loop_14:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L075dec1_loop_14
-.byte 102,15,56,223,209
- xorps %xmm6,%xmm2
- movups %xmm2,(%edi)
-L071xts_dec_ret:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- movdqa %xmm0,(%esp)
- pxor %xmm3,%xmm3
- movdqa %xmm0,16(%esp)
- pxor %xmm4,%xmm4
- movdqa %xmm0,32(%esp)
- pxor %xmm5,%xmm5
- movdqa %xmm0,48(%esp)
- pxor %xmm6,%xmm6
- movdqa %xmm0,64(%esp)
- pxor %xmm7,%xmm7
- movdqa %xmm0,80(%esp)
- movl 116(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_cbc_encrypt
-.private_extern _aes_hw_cbc_encrypt
-.align 4
-_aes_hw_cbc_encrypt:
-L_aes_hw_cbc_encrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl %esp,%ebx
- movl 24(%esp),%edi
- subl $24,%ebx
- movl 28(%esp),%eax
- andl $-16,%ebx
- movl 32(%esp),%edx
- movl 36(%esp),%ebp
- testl %eax,%eax
- jz L076cbc_abort
- cmpl $0,40(%esp)
- xchgl %esp,%ebx
- movups (%ebp),%xmm7
- movl 240(%edx),%ecx
- movl %edx,%ebp
- movl %ebx,16(%esp)
- movl %ecx,%ebx
- je L077cbc_decrypt
- movaps %xmm7,%xmm2
- cmpl $16,%eax
- jb L078cbc_enc_tail
- subl $16,%eax
- jmp L079cbc_enc_loop
-.align 4,0x90
-L079cbc_enc_loop:
- movups (%esi),%xmm7
- leal 16(%esi),%esi
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- xorps %xmm0,%xmm7
- leal 32(%edx),%edx
- xorps %xmm7,%xmm2
-L080enc1_loop_15:
-.byte 102,15,56,220,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L080enc1_loop_15
-.byte 102,15,56,221,209
- movl %ebx,%ecx
- movl %ebp,%edx
- movups %xmm2,(%edi)
- leal 16(%edi),%edi
- subl $16,%eax
- jnc L079cbc_enc_loop
- addl $16,%eax
- jnz L078cbc_enc_tail
- movaps %xmm2,%xmm7
- pxor %xmm2,%xmm2
- jmp L081cbc_ret
-L078cbc_enc_tail:
- movl %eax,%ecx
-.long 2767451785
- movl $16,%ecx
- subl %eax,%ecx
- xorl %eax,%eax
-.long 2868115081
- leal -16(%edi),%edi
- movl %ebx,%ecx
- movl %edi,%esi
- movl %ebp,%edx
- jmp L079cbc_enc_loop
-.align 4,0x90
-L077cbc_decrypt:
- cmpl $80,%eax
- jbe L082cbc_dec_tail
- movaps %xmm7,(%esp)
- subl $80,%eax
- jmp L083cbc_dec_loop6_enter
-.align 4,0x90
-L084cbc_dec_loop6:
- movaps %xmm0,(%esp)
- movups %xmm7,(%edi)
- leal 16(%edi),%edi
-L083cbc_dec_loop6_enter:
- movdqu (%esi),%xmm2
- movdqu 16(%esi),%xmm3
- movdqu 32(%esi),%xmm4
- movdqu 48(%esi),%xmm5
- movdqu 64(%esi),%xmm6
- movdqu 80(%esi),%xmm7
- call __aesni_decrypt6
- movups (%esi),%xmm1
- movups 16(%esi),%xmm0
- xorps (%esp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%esi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%esi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%esi),%xmm1
- xorps %xmm0,%xmm6
- movups 80(%esi),%xmm0
- xorps %xmm1,%xmm7
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- leal 96(%esi),%esi
- movups %xmm4,32(%edi)
- movl %ebx,%ecx
- movups %xmm5,48(%edi)
- movl %ebp,%edx
- movups %xmm6,64(%edi)
- leal 80(%edi),%edi
- subl $96,%eax
- ja L084cbc_dec_loop6
- movaps %xmm7,%xmm2
- movaps %xmm0,%xmm7
- addl $80,%eax
- jle L085cbc_dec_clear_tail_collected
- movups %xmm2,(%edi)
- leal 16(%edi),%edi
-L082cbc_dec_tail:
- movups (%esi),%xmm2
- movaps %xmm2,%xmm6
- cmpl $16,%eax
- jbe L086cbc_dec_one
- movups 16(%esi),%xmm3
- movaps %xmm3,%xmm5
- cmpl $32,%eax
- jbe L087cbc_dec_two
- movups 32(%esi),%xmm4
- cmpl $48,%eax
- jbe L088cbc_dec_three
- movups 48(%esi),%xmm5
- cmpl $64,%eax
- jbe L089cbc_dec_four
- movups 64(%esi),%xmm6
- movaps %xmm7,(%esp)
- movups (%esi),%xmm2
- xorps %xmm7,%xmm7
- call __aesni_decrypt6
- movups (%esi),%xmm1
- movups 16(%esi),%xmm0
- xorps (%esp),%xmm2
- xorps %xmm1,%xmm3
- movups 32(%esi),%xmm1
- xorps %xmm0,%xmm4
- movups 48(%esi),%xmm0
- xorps %xmm1,%xmm5
- movups 64(%esi),%xmm7
- xorps %xmm0,%xmm6
- movups %xmm2,(%edi)
- movups %xmm3,16(%edi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%edi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%edi)
- pxor %xmm5,%xmm5
- leal 64(%edi),%edi
- movaps %xmm6,%xmm2
- pxor %xmm6,%xmm6
- subl $80,%eax
- jmp L090cbc_dec_tail_collected
-.align 4,0x90
-L086cbc_dec_one:
- movups (%edx),%xmm0
- movups 16(%edx),%xmm1
- leal 32(%edx),%edx
- xorps %xmm0,%xmm2
-L091dec1_loop_16:
-.byte 102,15,56,222,209
- decl %ecx
- movups (%edx),%xmm1
- leal 16(%edx),%edx
- jnz L091dec1_loop_16
-.byte 102,15,56,223,209
- xorps %xmm7,%xmm2
- movaps %xmm6,%xmm7
- subl $16,%eax
- jmp L090cbc_dec_tail_collected
-.align 4,0x90
-L087cbc_dec_two:
- call __aesni_decrypt2
- xorps %xmm7,%xmm2
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- movaps %xmm3,%xmm2
- pxor %xmm3,%xmm3
- leal 16(%edi),%edi
- movaps %xmm5,%xmm7
- subl $32,%eax
- jmp L090cbc_dec_tail_collected
-.align 4,0x90
-L088cbc_dec_three:
- call __aesni_decrypt3
- xorps %xmm7,%xmm2
- xorps %xmm6,%xmm3
- xorps %xmm5,%xmm4
- movups %xmm2,(%edi)
- movaps %xmm4,%xmm2
- pxor %xmm4,%xmm4
- movups %xmm3,16(%edi)
- pxor %xmm3,%xmm3
- leal 32(%edi),%edi
- movups 32(%esi),%xmm7
- subl $48,%eax
- jmp L090cbc_dec_tail_collected
-.align 4,0x90
-L089cbc_dec_four:
- call __aesni_decrypt4
- movups 16(%esi),%xmm1
- movups 32(%esi),%xmm0
- xorps %xmm7,%xmm2
- movups 48(%esi),%xmm7
- xorps %xmm6,%xmm3
- movups %xmm2,(%edi)
- xorps %xmm1,%xmm4
- movups %xmm3,16(%edi)
- pxor %xmm3,%xmm3
- xorps %xmm0,%xmm5
- movups %xmm4,32(%edi)
- pxor %xmm4,%xmm4
- leal 48(%edi),%edi
- movaps %xmm5,%xmm2
- pxor %xmm5,%xmm5
- subl $64,%eax
- jmp L090cbc_dec_tail_collected
-.align 4,0x90
-L085cbc_dec_clear_tail_collected:
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
-L090cbc_dec_tail_collected:
- andl $15,%eax
- jnz L092cbc_dec_tail_partial
- movups %xmm2,(%edi)
- pxor %xmm0,%xmm0
- jmp L081cbc_ret
-.align 4,0x90
-L092cbc_dec_tail_partial:
- movaps %xmm2,(%esp)
- pxor %xmm0,%xmm0
- movl $16,%ecx
- movl %esp,%esi
- subl %eax,%ecx
-.long 2767451785
- movdqa %xmm2,(%esp)
-L081cbc_ret:
- movl 16(%esp),%esp
- movl 36(%esp),%ebp
- pxor %xmm2,%xmm2
- pxor %xmm1,%xmm1
- movups %xmm7,(%ebp)
- pxor %xmm7,%xmm7
-L076cbc_abort:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.private_extern __aesni_set_encrypt_key
-.align 4
-__aesni_set_encrypt_key:
- pushl %ebp
- pushl %ebx
- testl %eax,%eax
- jz L093bad_pointer
- testl %edx,%edx
- jz L093bad_pointer
- call L094pic
-L094pic:
- popl %ebx
- leal Lkey_const-L094pic(%ebx),%ebx
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
- movups (%eax),%xmm0
- xorps %xmm4,%xmm4
- movl 4(%ebp),%ebp
- leal 16(%edx),%edx
- andl $268437504,%ebp
- cmpl $256,%ecx
- je L09514rounds
- cmpl $192,%ecx
- je L09612rounds
- cmpl $128,%ecx
- jne L097bad_keybits
-.align 4,0x90
-L09810rounds:
- cmpl $268435456,%ebp
- je L09910rounds_alt
- movl $9,%ecx
- movups %xmm0,-16(%edx)
-.byte 102,15,58,223,200,1
- call L100key_128_cold
-.byte 102,15,58,223,200,2
- call L101key_128
-.byte 102,15,58,223,200,4
- call L101key_128
-.byte 102,15,58,223,200,8
- call L101key_128
-.byte 102,15,58,223,200,16
- call L101key_128
-.byte 102,15,58,223,200,32
- call L101key_128
-.byte 102,15,58,223,200,64
- call L101key_128
-.byte 102,15,58,223,200,128
- call L101key_128
-.byte 102,15,58,223,200,27
- call L101key_128
-.byte 102,15,58,223,200,54
- call L101key_128
- movups %xmm0,(%edx)
- movl %ecx,80(%edx)
- jmp L102good_key
-.align 4,0x90
-L101key_128:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
-L100key_128_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-.align 4,0x90
-L09910rounds_alt:
- movdqa (%ebx),%xmm5
- movl $8,%ecx
- movdqa 32(%ebx),%xmm4
- movdqa %xmm0,%xmm2
- movdqu %xmm0,-16(%edx)
-L103loop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
- leal 16(%edx),%edx
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
- pxor %xmm2,%xmm0
- movdqu %xmm0,-16(%edx)
- movdqa %xmm0,%xmm2
- decl %ecx
- jnz L103loop_key128
- movdqa 48(%ebx),%xmm4
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%edx)
- movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
- pxor %xmm2,%xmm0
- movdqu %xmm0,16(%edx)
- movl $9,%ecx
- movl %ecx,96(%edx)
- jmp L102good_key
-.align 4,0x90
-L09612rounds:
- movq 16(%eax),%xmm2
- cmpl $268435456,%ebp
- je L10412rounds_alt
- movl $11,%ecx
- movups %xmm0,-16(%edx)
-.byte 102,15,58,223,202,1
- call L105key_192a_cold
-.byte 102,15,58,223,202,2
- call L106key_192b
-.byte 102,15,58,223,202,4
- call L107key_192a
-.byte 102,15,58,223,202,8
- call L106key_192b
-.byte 102,15,58,223,202,16
- call L107key_192a
-.byte 102,15,58,223,202,32
- call L106key_192b
-.byte 102,15,58,223,202,64
- call L107key_192a
-.byte 102,15,58,223,202,128
- call L106key_192b
- movups %xmm0,(%edx)
- movl %ecx,48(%edx)
- jmp L102good_key
-.align 4,0x90
-L107key_192a:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
-.align 4,0x90
-L105key_192a_cold:
- movaps %xmm2,%xmm5
-L108key_192b_warm:
- shufps $16,%xmm0,%xmm4
- movdqa %xmm2,%xmm3
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- pslldq $4,%xmm3
- xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
- pxor %xmm3,%xmm2
- ret
-.align 4,0x90
-L106key_192b:
- movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
- movups %xmm5,(%edx)
- shufps $78,%xmm2,%xmm3
- movups %xmm3,16(%edx)
- leal 32(%edx),%edx
- jmp L108key_192b_warm
-.align 4,0x90
-L10412rounds_alt:
- movdqa 16(%ebx),%xmm5
- movdqa 32(%ebx),%xmm4
- movl $8,%ecx
- movdqu %xmm0,-16(%edx)
-L109loop_key192:
- movq %xmm2,(%edx)
- movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
- pslld $1,%xmm4
- leal 24(%edx),%edx
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
- pshufd $255,%xmm0,%xmm3
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm2
- movdqu %xmm0,-16(%edx)
- decl %ecx
- jnz L109loop_key192
- movl $11,%ecx
- movl %ecx,32(%edx)
- jmp L102good_key
-.align 4,0x90
-L09514rounds:
- movups 16(%eax),%xmm2
- leal 16(%edx),%edx
- cmpl $268435456,%ebp
- je L11014rounds_alt
- movl $13,%ecx
- movups %xmm0,-32(%edx)
- movups %xmm2,-16(%edx)
-.byte 102,15,58,223,202,1
- call L111key_256a_cold
-.byte 102,15,58,223,200,1
- call L112key_256b
-.byte 102,15,58,223,202,2
- call L113key_256a
-.byte 102,15,58,223,200,2
- call L112key_256b
-.byte 102,15,58,223,202,4
- call L113key_256a
-.byte 102,15,58,223,200,4
- call L112key_256b
-.byte 102,15,58,223,202,8
- call L113key_256a
-.byte 102,15,58,223,200,8
- call L112key_256b
-.byte 102,15,58,223,202,16
- call L113key_256a
-.byte 102,15,58,223,200,16
- call L112key_256b
-.byte 102,15,58,223,202,32
- call L113key_256a
-.byte 102,15,58,223,200,32
- call L112key_256b
-.byte 102,15,58,223,202,64
- call L113key_256a
- movups %xmm0,(%edx)
- movl %ecx,16(%edx)
- xorl %eax,%eax
- jmp L102good_key
-.align 4,0x90
-L113key_256a:
- movups %xmm2,(%edx)
- leal 16(%edx),%edx
-L111key_256a_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-.align 4,0x90
-L112key_256b:
- movups %xmm0,(%edx)
- leal 16(%edx),%edx
- shufps $16,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
- xorps %xmm1,%xmm2
- ret
-.align 4,0x90
-L11014rounds_alt:
- movdqa (%ebx),%xmm5
- movdqa 32(%ebx),%xmm4
- movl $7,%ecx
- movdqu %xmm0,-32(%edx)
- movdqa %xmm2,%xmm1
- movdqu %xmm2,-16(%edx)
-L114loop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
- pslld $1,%xmm4
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%edx)
- decl %ecx
- jz L115done_key256
- pshufd $255,%xmm0,%xmm2
- pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
- movdqa %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm3,%xmm1
- pxor %xmm1,%xmm2
- movdqu %xmm2,16(%edx)
- leal 32(%edx),%edx
- movdqa %xmm2,%xmm1
- jmp L114loop_key256
-L115done_key256:
- movl $13,%ecx
- movl %ecx,16(%edx)
-L102good_key:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- xorl %eax,%eax
- popl %ebx
- popl %ebp
- ret
-.align 2,0x90
-L093bad_pointer:
- movl $-1,%eax
- popl %ebx
- popl %ebp
- ret
-.align 2,0x90
-L097bad_keybits:
- pxor %xmm0,%xmm0
- movl $-2,%eax
- popl %ebx
- popl %ebp
- ret
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-.align 4
-_aes_hw_set_encrypt_key:
-L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L116pic
-L116pic:
- popl %ebx
- leal _BORINGSSL_function_hit+3-L116pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 4(%esp),%eax
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- call __aesni_set_encrypt_key
- ret
-.globl _aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
-.align 4
-_aes_hw_set_decrypt_key:
-L_aes_hw_set_decrypt_key_begin:
- movl 4(%esp),%eax
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- call __aesni_set_encrypt_key
- movl 12(%esp),%edx
- shll $4,%ecx
- testl %eax,%eax
- jnz L117dec_key_ret
- leal 16(%edx,%ecx,1),%eax
- movups (%edx),%xmm0
- movups (%eax),%xmm1
- movups %xmm0,(%eax)
- movups %xmm1,(%edx)
- leal 16(%edx),%edx
- leal -16(%eax),%eax
-L118dec_key_inverse:
- movups (%edx),%xmm0
- movups (%eax),%xmm1
-.byte 102,15,56,219,192
-.byte 102,15,56,219,201
- leal 16(%edx),%edx
- leal -16(%eax),%eax
- movups %xmm0,16(%eax)
- movups %xmm1,-16(%edx)
- cmpl %edx,%eax
- ja L118dec_key_inverse
- movups (%edx),%xmm0
-.byte 102,15,56,219,192
- movups %xmm0,(%edx)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- xorl %eax,%eax
-L117dec_key_ret:
- ret
-.align 6,0x90
-Lkey_const:
-.long 202313229,202313229,202313229,202313229
-.long 67569157,67569157,67569157,67569157
-.long 1,1,1,1
-.long 27,27,27,27
-.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
-.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
-.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
-.byte 115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/bn-586-apple.S b/apple-x86/crypto/fipsmodule/bn-586-apple.S
deleted file mode 100644
index 93513d0..0000000
--- a/apple-x86/crypto/fipsmodule/bn-586-apple.S
+++ /dev/null
@@ -1,987 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _bn_mul_add_words
-.private_extern _bn_mul_add_words
-.align 4
-_bn_mul_add_words:
-L_bn_mul_add_words_begin:
- call L000PIC_me_up
-L000PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L001maw_non_sse2
- movl 4(%esp),%eax
- movl 8(%esp),%edx
- movl 12(%esp),%ecx
- movd 16(%esp),%mm0
- pxor %mm1,%mm1
- jmp L002maw_sse2_entry
-.align 4,0x90
-L003maw_sse2_unrolled:
- movd (%eax),%mm3
- paddq %mm3,%mm1
- movd (%edx),%mm2
- pmuludq %mm0,%mm2
- movd 4(%edx),%mm4
- pmuludq %mm0,%mm4
- movd 8(%edx),%mm6
- pmuludq %mm0,%mm6
- movd 12(%edx),%mm7
- pmuludq %mm0,%mm7
- paddq %mm2,%mm1
- movd 4(%eax),%mm3
- paddq %mm4,%mm3
- movd 8(%eax),%mm5
- paddq %mm6,%mm5
- movd 12(%eax),%mm4
- paddq %mm4,%mm7
- movd %mm1,(%eax)
- movd 16(%edx),%mm2
- pmuludq %mm0,%mm2
- psrlq $32,%mm1
- movd 20(%edx),%mm4
- pmuludq %mm0,%mm4
- paddq %mm3,%mm1
- movd 24(%edx),%mm6
- pmuludq %mm0,%mm6
- movd %mm1,4(%eax)
- psrlq $32,%mm1
- movd 28(%edx),%mm3
- addl $32,%edx
- pmuludq %mm0,%mm3
- paddq %mm5,%mm1
- movd 16(%eax),%mm5
- paddq %mm5,%mm2
- movd %mm1,8(%eax)
- psrlq $32,%mm1
- paddq %mm7,%mm1
- movd 20(%eax),%mm5
- paddq %mm5,%mm4
- movd %mm1,12(%eax)
- psrlq $32,%mm1
- paddq %mm2,%mm1
- movd 24(%eax),%mm5
- paddq %mm5,%mm6
- movd %mm1,16(%eax)
- psrlq $32,%mm1
- paddq %mm4,%mm1
- movd 28(%eax),%mm5
- paddq %mm5,%mm3
- movd %mm1,20(%eax)
- psrlq $32,%mm1
- paddq %mm6,%mm1
- movd %mm1,24(%eax)
- psrlq $32,%mm1
- paddq %mm3,%mm1
- movd %mm1,28(%eax)
- leal 32(%eax),%eax
- psrlq $32,%mm1
- subl $8,%ecx
- jz L004maw_sse2_exit
-L002maw_sse2_entry:
- testl $4294967288,%ecx
- jnz L003maw_sse2_unrolled
-.align 2,0x90
-L005maw_sse2_loop:
- movd (%edx),%mm2
- movd (%eax),%mm3
- pmuludq %mm0,%mm2
- leal 4(%edx),%edx
- paddq %mm3,%mm1
- paddq %mm2,%mm1
- movd %mm1,(%eax)
- subl $1,%ecx
- psrlq $32,%mm1
- leal 4(%eax),%eax
- jnz L005maw_sse2_loop
-L004maw_sse2_exit:
- movd %mm1,%eax
- emms
- ret
-.align 4,0x90
-L001maw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 28(%esp),%ecx
- movl 24(%esp),%ebx
- andl $4294967288,%ecx
- movl 32(%esp),%ebp
- pushl %ecx
- jz L006maw_finish
-.align 4,0x90
-L007maw_loop:
- # Round 0
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- # Round 4
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- # Round 8
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- # Round 12
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- # Round 16
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- # Round 20
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- # Round 24
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
- # Round 28
- movl 28(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 28(%edi),%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- subl $8,%ecx
- leal 32(%ebx),%ebx
- leal 32(%edi),%edi
- jnz L007maw_loop
-L006maw_finish:
- movl 32(%esp),%ecx
- andl $7,%ecx
- jnz L008maw_finish2
- jmp L009maw_end
-L008maw_finish2:
- # Tail Round 0
- movl (%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl (%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 1
- movl 4(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 4(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,4(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 2
- movl 8(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 8(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,8(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 3
- movl 12(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 12(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,12(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 4
- movl 16(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 16(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,16(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 5
- movl 20(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 20(%edi),%eax
- adcl $0,%edx
- decl %ecx
- movl %eax,20(%edi)
- movl %edx,%esi
- jz L009maw_end
- # Tail Round 6
- movl 24(%ebx),%eax
- mull %ebp
- addl %esi,%eax
- adcl $0,%edx
- addl 24(%edi),%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-L009maw_end:
- movl %esi,%eax
- popl %ecx
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _bn_mul_words
-.private_extern _bn_mul_words
-.align 4
-_bn_mul_words:
-L_bn_mul_words_begin:
- call L010PIC_me_up
-L010PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L011mw_non_sse2
- movl 4(%esp),%eax
- movl 8(%esp),%edx
- movl 12(%esp),%ecx
- movd 16(%esp),%mm0
- pxor %mm1,%mm1
-.align 4,0x90
-L012mw_sse2_loop:
- movd (%edx),%mm2
- pmuludq %mm0,%mm2
- leal 4(%edx),%edx
- paddq %mm2,%mm1
- movd %mm1,(%eax)
- subl $1,%ecx
- psrlq $32,%mm1
- leal 4(%eax),%eax
- jnz L012mw_sse2_loop
- movd %mm1,%eax
- emms
- ret
-.align 4,0x90
-L011mw_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- xorl %esi,%esi
- movl 20(%esp),%edi
- movl 24(%esp),%ebx
- movl 28(%esp),%ebp
- movl 32(%esp),%ecx
- andl $4294967288,%ebp
- jz L013mw_finish
-L014mw_loop:
- # Round 0
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- # Round 4
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- # Round 8
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- # Round 12
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- # Round 16
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- # Round 20
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- # Round 24
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
- # Round 28
- movl 28(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,28(%edi)
- movl %edx,%esi
-
- addl $32,%ebx
- addl $32,%edi
- subl $8,%ebp
- jz L013mw_finish
- jmp L014mw_loop
-L013mw_finish:
- movl 28(%esp),%ebp
- andl $7,%ebp
- jnz L015mw_finish2
- jmp L016mw_end
-L015mw_finish2:
- # Tail Round 0
- movl (%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 1
- movl 4(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,4(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 2
- movl 8(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,8(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 3
- movl 12(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,12(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 4
- movl 16(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,16(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 5
- movl 20(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,20(%edi)
- movl %edx,%esi
- decl %ebp
- jz L016mw_end
- # Tail Round 6
- movl 24(%ebx),%eax
- mull %ecx
- addl %esi,%eax
- adcl $0,%edx
- movl %eax,24(%edi)
- movl %edx,%esi
-L016mw_end:
- movl %esi,%eax
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _bn_sqr_words
-.private_extern _bn_sqr_words
-.align 4
-_bn_sqr_words:
-L_bn_sqr_words_begin:
- call L017PIC_me_up
-L017PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L018sqr_non_sse2
- movl 4(%esp),%eax
- movl 8(%esp),%edx
- movl 12(%esp),%ecx
-.align 4,0x90
-L019sqr_sse2_loop:
- movd (%edx),%mm0
- pmuludq %mm0,%mm0
- leal 4(%edx),%edx
- movq %mm0,(%eax)
- subl $1,%ecx
- leal 8(%eax),%eax
- jnz L019sqr_sse2_loop
- emms
- ret
-.align 4,0x90
-L018sqr_non_sse2:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%ebx
- andl $4294967288,%ebx
- jz L020sw_finish
-L021sw_loop:
- # Round 0
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- movl %edx,4(%esi)
- # Round 4
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- movl %edx,12(%esi)
- # Round 8
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- movl %edx,20(%esi)
- # Round 12
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- movl %edx,28(%esi)
- # Round 16
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- movl %edx,36(%esi)
- # Round 20
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- movl %edx,44(%esi)
- # Round 24
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
- # Round 28
- movl 28(%edi),%eax
- mull %eax
- movl %eax,56(%esi)
- movl %edx,60(%esi)
-
- addl $32,%edi
- addl $64,%esi
- subl $8,%ebx
- jnz L021sw_loop
-L020sw_finish:
- movl 28(%esp),%ebx
- andl $7,%ebx
- jz L022sw_end
- # Tail Round 0
- movl (%edi),%eax
- mull %eax
- movl %eax,(%esi)
- decl %ebx
- movl %edx,4(%esi)
- jz L022sw_end
- # Tail Round 1
- movl 4(%edi),%eax
- mull %eax
- movl %eax,8(%esi)
- decl %ebx
- movl %edx,12(%esi)
- jz L022sw_end
- # Tail Round 2
- movl 8(%edi),%eax
- mull %eax
- movl %eax,16(%esi)
- decl %ebx
- movl %edx,20(%esi)
- jz L022sw_end
- # Tail Round 3
- movl 12(%edi),%eax
- mull %eax
- movl %eax,24(%esi)
- decl %ebx
- movl %edx,28(%esi)
- jz L022sw_end
- # Tail Round 4
- movl 16(%edi),%eax
- mull %eax
- movl %eax,32(%esi)
- decl %ebx
- movl %edx,36(%esi)
- jz L022sw_end
- # Tail Round 5
- movl 20(%edi),%eax
- mull %eax
- movl %eax,40(%esi)
- decl %ebx
- movl %edx,44(%esi)
- jz L022sw_end
- # Tail Round 6
- movl 24(%edi),%eax
- mull %eax
- movl %eax,48(%esi)
- movl %edx,52(%esi)
-L022sw_end:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _bn_div_words
-.private_extern _bn_div_words
-.align 4
-_bn_div_words:
-L_bn_div_words_begin:
- movl 4(%esp),%edx
- movl 8(%esp),%eax
- movl 12(%esp),%ecx
- divl %ecx
- ret
-.globl _bn_add_words
-.private_extern _bn_add_words
-.align 4
-_bn_add_words:
-L_bn_add_words_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- movl 20(%esp),%ebx
- movl 24(%esp),%esi
- movl 28(%esp),%edi
- movl 32(%esp),%ebp
- xorl %eax,%eax
- andl $4294967288,%ebp
- jz L023aw_finish
-L024aw_loop:
- # Round 0
- movl (%esi),%ecx
- movl (%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,(%ebx)
- # Round 1
- movl 4(%esi),%ecx
- movl 4(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,4(%ebx)
- # Round 2
- movl 8(%esi),%ecx
- movl 8(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,8(%ebx)
- # Round 3
- movl 12(%esi),%ecx
- movl 12(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,12(%ebx)
- # Round 4
- movl 16(%esi),%ecx
- movl 16(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,16(%ebx)
- # Round 5
- movl 20(%esi),%ecx
- movl 20(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,20(%ebx)
- # Round 6
- movl 24(%esi),%ecx
- movl 24(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,24(%ebx)
- # Round 7
- movl 28(%esi),%ecx
- movl 28(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,28(%ebx)
-
- addl $32,%esi
- addl $32,%edi
- addl $32,%ebx
- subl $8,%ebp
- jnz L024aw_loop
-L023aw_finish:
- movl 32(%esp),%ebp
- andl $7,%ebp
- jz L025aw_end
- # Tail Round 0
- movl (%esi),%ecx
- movl (%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,(%ebx)
- jz L025aw_end
- # Tail Round 1
- movl 4(%esi),%ecx
- movl 4(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,4(%ebx)
- jz L025aw_end
- # Tail Round 2
- movl 8(%esi),%ecx
- movl 8(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,8(%ebx)
- jz L025aw_end
- # Tail Round 3
- movl 12(%esi),%ecx
- movl 12(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,12(%ebx)
- jz L025aw_end
- # Tail Round 4
- movl 16(%esi),%ecx
- movl 16(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,16(%ebx)
- jz L025aw_end
- # Tail Round 5
- movl 20(%esi),%ecx
- movl 20(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,20(%ebx)
- jz L025aw_end
- # Tail Round 6
- movl 24(%esi),%ecx
- movl 24(%edi),%edx
- addl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- addl %edx,%ecx
- adcl $0,%eax
- movl %ecx,24(%ebx)
-L025aw_end:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _bn_sub_words
-.private_extern _bn_sub_words
-.align 4
-_bn_sub_words:
-L_bn_sub_words_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-
- movl 20(%esp),%ebx
- movl 24(%esp),%esi
- movl 28(%esp),%edi
- movl 32(%esp),%ebp
- xorl %eax,%eax
- andl $4294967288,%ebp
- jz L026aw_finish
-L027aw_loop:
- # Round 0
- movl (%esi),%ecx
- movl (%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,(%ebx)
- # Round 1
- movl 4(%esi),%ecx
- movl 4(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,4(%ebx)
- # Round 2
- movl 8(%esi),%ecx
- movl 8(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,8(%ebx)
- # Round 3
- movl 12(%esi),%ecx
- movl 12(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,12(%ebx)
- # Round 4
- movl 16(%esi),%ecx
- movl 16(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,16(%ebx)
- # Round 5
- movl 20(%esi),%ecx
- movl 20(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,20(%ebx)
- # Round 6
- movl 24(%esi),%ecx
- movl 24(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,24(%ebx)
- # Round 7
- movl 28(%esi),%ecx
- movl 28(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,28(%ebx)
-
- addl $32,%esi
- addl $32,%edi
- addl $32,%ebx
- subl $8,%ebp
- jnz L027aw_loop
-L026aw_finish:
- movl 32(%esp),%ebp
- andl $7,%ebp
- jz L028aw_end
- # Tail Round 0
- movl (%esi),%ecx
- movl (%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,(%ebx)
- jz L028aw_end
- # Tail Round 1
- movl 4(%esi),%ecx
- movl 4(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,4(%ebx)
- jz L028aw_end
- # Tail Round 2
- movl 8(%esi),%ecx
- movl 8(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,8(%ebx)
- jz L028aw_end
- # Tail Round 3
- movl 12(%esi),%ecx
- movl 12(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,12(%ebx)
- jz L028aw_end
- # Tail Round 4
- movl 16(%esi),%ecx
- movl 16(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,16(%ebx)
- jz L028aw_end
- # Tail Round 5
- movl 20(%esi),%ecx
- movl 20(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- decl %ebp
- movl %ecx,20(%ebx)
- jz L028aw_end
- # Tail Round 6
- movl 24(%esi),%ecx
- movl 24(%edi),%edx
- subl %eax,%ecx
- movl $0,%eax
- adcl %eax,%eax
- subl %edx,%ecx
- adcl $0,%eax
- movl %ecx,24(%ebx)
-L028aw_end:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/co-586-apple.S b/apple-x86/crypto/fipsmodule/co-586-apple.S
deleted file mode 100644
index ab985ee..0000000
--- a/apple-x86/crypto/fipsmodule/co-586-apple.S
+++ /dev/null
@@ -1,1256 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _bn_mul_comba8
-.private_extern _bn_mul_comba8
-.align 4
-_bn_mul_comba8:
-L_bn_mul_comba8_begin:
- pushl %esi
- movl 12(%esp),%esi
- pushl %edi
- movl 20(%esp),%edi
- pushl %ebp
- pushl %ebx
- xorl %ebx,%ebx
- movl (%esi),%eax
- xorl %ecx,%ecx
- movl (%edi),%edx
- # ################## Calculate word 0
- xorl %ebp,%ebp
- # mul a[0]*b[0]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl (%edi),%edx
- adcl $0,%ebp
- movl %ebx,(%eax)
- movl 4(%esi),%eax
- # saved r[0]
- # ################## Calculate word 1
- xorl %ebx,%ebx
- # mul a[1]*b[0]
- mull %edx
- addl %eax,%ecx
- movl (%esi),%eax
- adcl %edx,%ebp
- movl 4(%edi),%edx
- adcl $0,%ebx
- # mul a[0]*b[1]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl (%edi),%edx
- adcl $0,%ebx
- movl %ecx,4(%eax)
- movl 8(%esi),%eax
- # saved r[1]
- # ################## Calculate word 2
- xorl %ecx,%ecx
- # mul a[2]*b[0]
- mull %edx
- addl %eax,%ebp
- movl 4(%esi),%eax
- adcl %edx,%ebx
- movl 4(%edi),%edx
- adcl $0,%ecx
- # mul a[1]*b[1]
- mull %edx
- addl %eax,%ebp
- movl (%esi),%eax
- adcl %edx,%ebx
- movl 8(%edi),%edx
- adcl $0,%ecx
- # mul a[0]*b[2]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl (%edi),%edx
- adcl $0,%ecx
- movl %ebp,8(%eax)
- movl 12(%esi),%eax
- # saved r[2]
- # ################## Calculate word 3
- xorl %ebp,%ebp
- # mul a[3]*b[0]
- mull %edx
- addl %eax,%ebx
- movl 8(%esi),%eax
- adcl %edx,%ecx
- movl 4(%edi),%edx
- adcl $0,%ebp
- # mul a[2]*b[1]
- mull %edx
- addl %eax,%ebx
- movl 4(%esi),%eax
- adcl %edx,%ecx
- movl 8(%edi),%edx
- adcl $0,%ebp
- # mul a[1]*b[2]
- mull %edx
- addl %eax,%ebx
- movl (%esi),%eax
- adcl %edx,%ecx
- movl 12(%edi),%edx
- adcl $0,%ebp
- # mul a[0]*b[3]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl (%edi),%edx
- adcl $0,%ebp
- movl %ebx,12(%eax)
- movl 16(%esi),%eax
- # saved r[3]
- # ################## Calculate word 4
- xorl %ebx,%ebx
- # mul a[4]*b[0]
- mull %edx
- addl %eax,%ecx
- movl 12(%esi),%eax
- adcl %edx,%ebp
- movl 4(%edi),%edx
- adcl $0,%ebx
- # mul a[3]*b[1]
- mull %edx
- addl %eax,%ecx
- movl 8(%esi),%eax
- adcl %edx,%ebp
- movl 8(%edi),%edx
- adcl $0,%ebx
- # mul a[2]*b[2]
- mull %edx
- addl %eax,%ecx
- movl 4(%esi),%eax
- adcl %edx,%ebp
- movl 12(%edi),%edx
- adcl $0,%ebx
- # mul a[1]*b[3]
- mull %edx
- addl %eax,%ecx
- movl (%esi),%eax
- adcl %edx,%ebp
- movl 16(%edi),%edx
- adcl $0,%ebx
- # mul a[0]*b[4]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl (%edi),%edx
- adcl $0,%ebx
- movl %ecx,16(%eax)
- movl 20(%esi),%eax
- # saved r[4]
- # ################## Calculate word 5
- xorl %ecx,%ecx
- # mul a[5]*b[0]
- mull %edx
- addl %eax,%ebp
- movl 16(%esi),%eax
- adcl %edx,%ebx
- movl 4(%edi),%edx
- adcl $0,%ecx
- # mul a[4]*b[1]
- mull %edx
- addl %eax,%ebp
- movl 12(%esi),%eax
- adcl %edx,%ebx
- movl 8(%edi),%edx
- adcl $0,%ecx
- # mul a[3]*b[2]
- mull %edx
- addl %eax,%ebp
- movl 8(%esi),%eax
- adcl %edx,%ebx
- movl 12(%edi),%edx
- adcl $0,%ecx
- # mul a[2]*b[3]
- mull %edx
- addl %eax,%ebp
- movl 4(%esi),%eax
- adcl %edx,%ebx
- movl 16(%edi),%edx
- adcl $0,%ecx
- # mul a[1]*b[4]
- mull %edx
- addl %eax,%ebp
- movl (%esi),%eax
- adcl %edx,%ebx
- movl 20(%edi),%edx
- adcl $0,%ecx
- # mul a[0]*b[5]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl (%edi),%edx
- adcl $0,%ecx
- movl %ebp,20(%eax)
- movl 24(%esi),%eax
- # saved r[5]
- # ################## Calculate word 6
- xorl %ebp,%ebp
- # mul a[6]*b[0]
- mull %edx
- addl %eax,%ebx
- movl 20(%esi),%eax
- adcl %edx,%ecx
- movl 4(%edi),%edx
- adcl $0,%ebp
- # mul a[5]*b[1]
- mull %edx
- addl %eax,%ebx
- movl 16(%esi),%eax
- adcl %edx,%ecx
- movl 8(%edi),%edx
- adcl $0,%ebp
- # mul a[4]*b[2]
- mull %edx
- addl %eax,%ebx
- movl 12(%esi),%eax
- adcl %edx,%ecx
- movl 12(%edi),%edx
- adcl $0,%ebp
- # mul a[3]*b[3]
- mull %edx
- addl %eax,%ebx
- movl 8(%esi),%eax
- adcl %edx,%ecx
- movl 16(%edi),%edx
- adcl $0,%ebp
- # mul a[2]*b[4]
- mull %edx
- addl %eax,%ebx
- movl 4(%esi),%eax
- adcl %edx,%ecx
- movl 20(%edi),%edx
- adcl $0,%ebp
- # mul a[1]*b[5]
- mull %edx
- addl %eax,%ebx
- movl (%esi),%eax
- adcl %edx,%ecx
- movl 24(%edi),%edx
- adcl $0,%ebp
- # mul a[0]*b[6]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl (%edi),%edx
- adcl $0,%ebp
- movl %ebx,24(%eax)
- movl 28(%esi),%eax
- # saved r[6]
- # ################## Calculate word 7
- xorl %ebx,%ebx
- # mul a[7]*b[0]
- mull %edx
- addl %eax,%ecx
- movl 24(%esi),%eax
- adcl %edx,%ebp
- movl 4(%edi),%edx
- adcl $0,%ebx
- # mul a[6]*b[1]
- mull %edx
- addl %eax,%ecx
- movl 20(%esi),%eax
- adcl %edx,%ebp
- movl 8(%edi),%edx
- adcl $0,%ebx
- # mul a[5]*b[2]
- mull %edx
- addl %eax,%ecx
- movl 16(%esi),%eax
- adcl %edx,%ebp
- movl 12(%edi),%edx
- adcl $0,%ebx
- # mul a[4]*b[3]
- mull %edx
- addl %eax,%ecx
- movl 12(%esi),%eax
- adcl %edx,%ebp
- movl 16(%edi),%edx
- adcl $0,%ebx
- # mul a[3]*b[4]
- mull %edx
- addl %eax,%ecx
- movl 8(%esi),%eax
- adcl %edx,%ebp
- movl 20(%edi),%edx
- adcl $0,%ebx
- # mul a[2]*b[5]
- mull %edx
- addl %eax,%ecx
- movl 4(%esi),%eax
- adcl %edx,%ebp
- movl 24(%edi),%edx
- adcl $0,%ebx
- # mul a[1]*b[6]
- mull %edx
- addl %eax,%ecx
- movl (%esi),%eax
- adcl %edx,%ebp
- movl 28(%edi),%edx
- adcl $0,%ebx
- # mul a[0]*b[7]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl 4(%edi),%edx
- adcl $0,%ebx
- movl %ecx,28(%eax)
- movl 28(%esi),%eax
- # saved r[7]
- # ################## Calculate word 8
- xorl %ecx,%ecx
- # mul a[7]*b[1]
- mull %edx
- addl %eax,%ebp
- movl 24(%esi),%eax
- adcl %edx,%ebx
- movl 8(%edi),%edx
- adcl $0,%ecx
- # mul a[6]*b[2]
- mull %edx
- addl %eax,%ebp
- movl 20(%esi),%eax
- adcl %edx,%ebx
- movl 12(%edi),%edx
- adcl $0,%ecx
- # mul a[5]*b[3]
- mull %edx
- addl %eax,%ebp
- movl 16(%esi),%eax
- adcl %edx,%ebx
- movl 16(%edi),%edx
- adcl $0,%ecx
- # mul a[4]*b[4]
- mull %edx
- addl %eax,%ebp
- movl 12(%esi),%eax
- adcl %edx,%ebx
- movl 20(%edi),%edx
- adcl $0,%ecx
- # mul a[3]*b[5]
- mull %edx
- addl %eax,%ebp
- movl 8(%esi),%eax
- adcl %edx,%ebx
- movl 24(%edi),%edx
- adcl $0,%ecx
- # mul a[2]*b[6]
- mull %edx
- addl %eax,%ebp
- movl 4(%esi),%eax
- adcl %edx,%ebx
- movl 28(%edi),%edx
- adcl $0,%ecx
- # mul a[1]*b[7]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl 8(%edi),%edx
- adcl $0,%ecx
- movl %ebp,32(%eax)
- movl 28(%esi),%eax
- # saved r[8]
- # ################## Calculate word 9
- xorl %ebp,%ebp
- # mul a[7]*b[2]
- mull %edx
- addl %eax,%ebx
- movl 24(%esi),%eax
- adcl %edx,%ecx
- movl 12(%edi),%edx
- adcl $0,%ebp
- # mul a[6]*b[3]
- mull %edx
- addl %eax,%ebx
- movl 20(%esi),%eax
- adcl %edx,%ecx
- movl 16(%edi),%edx
- adcl $0,%ebp
- # mul a[5]*b[4]
- mull %edx
- addl %eax,%ebx
- movl 16(%esi),%eax
- adcl %edx,%ecx
- movl 20(%edi),%edx
- adcl $0,%ebp
- # mul a[4]*b[5]
- mull %edx
- addl %eax,%ebx
- movl 12(%esi),%eax
- adcl %edx,%ecx
- movl 24(%edi),%edx
- adcl $0,%ebp
- # mul a[3]*b[6]
- mull %edx
- addl %eax,%ebx
- movl 8(%esi),%eax
- adcl %edx,%ecx
- movl 28(%edi),%edx
- adcl $0,%ebp
- # mul a[2]*b[7]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl 12(%edi),%edx
- adcl $0,%ebp
- movl %ebx,36(%eax)
- movl 28(%esi),%eax
- # saved r[9]
- # ################## Calculate word 10
- xorl %ebx,%ebx
- # mul a[7]*b[3]
- mull %edx
- addl %eax,%ecx
- movl 24(%esi),%eax
- adcl %edx,%ebp
- movl 16(%edi),%edx
- adcl $0,%ebx
- # mul a[6]*b[4]
- mull %edx
- addl %eax,%ecx
- movl 20(%esi),%eax
- adcl %edx,%ebp
- movl 20(%edi),%edx
- adcl $0,%ebx
- # mul a[5]*b[5]
- mull %edx
- addl %eax,%ecx
- movl 16(%esi),%eax
- adcl %edx,%ebp
- movl 24(%edi),%edx
- adcl $0,%ebx
- # mul a[4]*b[6]
- mull %edx
- addl %eax,%ecx
- movl 12(%esi),%eax
- adcl %edx,%ebp
- movl 28(%edi),%edx
- adcl $0,%ebx
- # mul a[3]*b[7]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl 16(%edi),%edx
- adcl $0,%ebx
- movl %ecx,40(%eax)
- movl 28(%esi),%eax
- # saved r[10]
- # ################## Calculate word 11
- xorl %ecx,%ecx
- # mul a[7]*b[4]
- mull %edx
- addl %eax,%ebp
- movl 24(%esi),%eax
- adcl %edx,%ebx
- movl 20(%edi),%edx
- adcl $0,%ecx
- # mul a[6]*b[5]
- mull %edx
- addl %eax,%ebp
- movl 20(%esi),%eax
- adcl %edx,%ebx
- movl 24(%edi),%edx
- adcl $0,%ecx
- # mul a[5]*b[6]
- mull %edx
- addl %eax,%ebp
- movl 16(%esi),%eax
- adcl %edx,%ebx
- movl 28(%edi),%edx
- adcl $0,%ecx
- # mul a[4]*b[7]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl 20(%edi),%edx
- adcl $0,%ecx
- movl %ebp,44(%eax)
- movl 28(%esi),%eax
- # saved r[11]
- # ################## Calculate word 12
- xorl %ebp,%ebp
- # mul a[7]*b[5]
- mull %edx
- addl %eax,%ebx
- movl 24(%esi),%eax
- adcl %edx,%ecx
- movl 24(%edi),%edx
- adcl $0,%ebp
- # mul a[6]*b[6]
- mull %edx
- addl %eax,%ebx
- movl 20(%esi),%eax
- adcl %edx,%ecx
- movl 28(%edi),%edx
- adcl $0,%ebp
- # mul a[5]*b[7]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl 24(%edi),%edx
- adcl $0,%ebp
- movl %ebx,48(%eax)
- movl 28(%esi),%eax
- # saved r[12]
- # ################## Calculate word 13
- xorl %ebx,%ebx
- # mul a[7]*b[6]
- mull %edx
- addl %eax,%ecx
- movl 24(%esi),%eax
- adcl %edx,%ebp
- movl 28(%edi),%edx
- adcl $0,%ebx
- # mul a[6]*b[7]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl 28(%edi),%edx
- adcl $0,%ebx
- movl %ecx,52(%eax)
- movl 28(%esi),%eax
- # saved r[13]
- # ################## Calculate word 14
- xorl %ecx,%ecx
- # mul a[7]*b[7]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- adcl $0,%ecx
- movl %ebp,56(%eax)
- # saved r[14]
- # save r[15]
- movl %ebx,60(%eax)
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-.globl _bn_mul_comba4
-.private_extern _bn_mul_comba4
-.align 4
-_bn_mul_comba4:
-L_bn_mul_comba4_begin:
- pushl %esi
- movl 12(%esp),%esi
- pushl %edi
- movl 20(%esp),%edi
- pushl %ebp
- pushl %ebx
- xorl %ebx,%ebx
- movl (%esi),%eax
- xorl %ecx,%ecx
- movl (%edi),%edx
- # ################## Calculate word 0
- xorl %ebp,%ebp
- # mul a[0]*b[0]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl (%edi),%edx
- adcl $0,%ebp
- movl %ebx,(%eax)
- movl 4(%esi),%eax
- # saved r[0]
- # ################## Calculate word 1
- xorl %ebx,%ebx
- # mul a[1]*b[0]
- mull %edx
- addl %eax,%ecx
- movl (%esi),%eax
- adcl %edx,%ebp
- movl 4(%edi),%edx
- adcl $0,%ebx
- # mul a[0]*b[1]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl (%edi),%edx
- adcl $0,%ebx
- movl %ecx,4(%eax)
- movl 8(%esi),%eax
- # saved r[1]
- # ################## Calculate word 2
- xorl %ecx,%ecx
- # mul a[2]*b[0]
- mull %edx
- addl %eax,%ebp
- movl 4(%esi),%eax
- adcl %edx,%ebx
- movl 4(%edi),%edx
- adcl $0,%ecx
- # mul a[1]*b[1]
- mull %edx
- addl %eax,%ebp
- movl (%esi),%eax
- adcl %edx,%ebx
- movl 8(%edi),%edx
- adcl $0,%ecx
- # mul a[0]*b[2]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl (%edi),%edx
- adcl $0,%ecx
- movl %ebp,8(%eax)
- movl 12(%esi),%eax
- # saved r[2]
- # ################## Calculate word 3
- xorl %ebp,%ebp
- # mul a[3]*b[0]
- mull %edx
- addl %eax,%ebx
- movl 8(%esi),%eax
- adcl %edx,%ecx
- movl 4(%edi),%edx
- adcl $0,%ebp
- # mul a[2]*b[1]
- mull %edx
- addl %eax,%ebx
- movl 4(%esi),%eax
- adcl %edx,%ecx
- movl 8(%edi),%edx
- adcl $0,%ebp
- # mul a[1]*b[2]
- mull %edx
- addl %eax,%ebx
- movl (%esi),%eax
- adcl %edx,%ecx
- movl 12(%edi),%edx
- adcl $0,%ebp
- # mul a[0]*b[3]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- movl 4(%edi),%edx
- adcl $0,%ebp
- movl %ebx,12(%eax)
- movl 12(%esi),%eax
- # saved r[3]
- # ################## Calculate word 4
- xorl %ebx,%ebx
- # mul a[3]*b[1]
- mull %edx
- addl %eax,%ecx
- movl 8(%esi),%eax
- adcl %edx,%ebp
- movl 8(%edi),%edx
- adcl $0,%ebx
- # mul a[2]*b[2]
- mull %edx
- addl %eax,%ecx
- movl 4(%esi),%eax
- adcl %edx,%ebp
- movl 12(%edi),%edx
- adcl $0,%ebx
- # mul a[1]*b[3]
- mull %edx
- addl %eax,%ecx
- movl 20(%esp),%eax
- adcl %edx,%ebp
- movl 8(%edi),%edx
- adcl $0,%ebx
- movl %ecx,16(%eax)
- movl 12(%esi),%eax
- # saved r[4]
- # ################## Calculate word 5
- xorl %ecx,%ecx
- # mul a[3]*b[2]
- mull %edx
- addl %eax,%ebp
- movl 8(%esi),%eax
- adcl %edx,%ebx
- movl 12(%edi),%edx
- adcl $0,%ecx
- # mul a[2]*b[3]
- mull %edx
- addl %eax,%ebp
- movl 20(%esp),%eax
- adcl %edx,%ebx
- movl 12(%edi),%edx
- adcl $0,%ecx
- movl %ebp,20(%eax)
- movl 12(%esi),%eax
- # saved r[5]
- # ################## Calculate word 6
- xorl %ebp,%ebp
- # mul a[3]*b[3]
- mull %edx
- addl %eax,%ebx
- movl 20(%esp),%eax
- adcl %edx,%ecx
- adcl $0,%ebp
- movl %ebx,24(%eax)
- # saved r[6]
- # save r[7]
- movl %ecx,28(%eax)
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-.globl _bn_sqr_comba8
-.private_extern _bn_sqr_comba8
-.align 4
-_bn_sqr_comba8:
-L_bn_sqr_comba8_begin:
- pushl %esi
- pushl %edi
- pushl %ebp
- pushl %ebx
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- xorl %ebx,%ebx
- xorl %ecx,%ecx
- movl (%esi),%eax
- # ############### Calculate word 0
- xorl %ebp,%ebp
- # sqr a[0]*a[0]
- mull %eax
- addl %eax,%ebx
- adcl %edx,%ecx
- movl (%esi),%edx
- adcl $0,%ebp
- movl %ebx,(%edi)
- movl 4(%esi),%eax
- # saved r[0]
- # ############### Calculate word 1
- xorl %ebx,%ebx
- # sqr a[1]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 8(%esi),%eax
- adcl $0,%ebx
- movl %ecx,4(%edi)
- movl (%esi),%edx
- # saved r[1]
- # ############### Calculate word 2
- xorl %ecx,%ecx
- # sqr a[2]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 4(%esi),%eax
- adcl $0,%ecx
- # sqr a[1]*a[1]
- mull %eax
- addl %eax,%ebp
- adcl %edx,%ebx
- movl (%esi),%edx
- adcl $0,%ecx
- movl %ebp,8(%edi)
- movl 12(%esi),%eax
- # saved r[2]
- # ############### Calculate word 3
- xorl %ebp,%ebp
- # sqr a[3]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 8(%esi),%eax
- adcl $0,%ebp
- movl 4(%esi),%edx
- # sqr a[2]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 16(%esi),%eax
- adcl $0,%ebp
- movl %ebx,12(%edi)
- movl (%esi),%edx
- # saved r[3]
- # ############### Calculate word 4
- xorl %ebx,%ebx
- # sqr a[4]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 12(%esi),%eax
- adcl $0,%ebx
- movl 4(%esi),%edx
- # sqr a[3]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 8(%esi),%eax
- adcl $0,%ebx
- # sqr a[2]*a[2]
- mull %eax
- addl %eax,%ecx
- adcl %edx,%ebp
- movl (%esi),%edx
- adcl $0,%ebx
- movl %ecx,16(%edi)
- movl 20(%esi),%eax
- # saved r[4]
- # ############### Calculate word 5
- xorl %ecx,%ecx
- # sqr a[5]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 16(%esi),%eax
- adcl $0,%ecx
- movl 4(%esi),%edx
- # sqr a[4]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 12(%esi),%eax
- adcl $0,%ecx
- movl 8(%esi),%edx
- # sqr a[3]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 24(%esi),%eax
- adcl $0,%ecx
- movl %ebp,20(%edi)
- movl (%esi),%edx
- # saved r[5]
- # ############### Calculate word 6
- xorl %ebp,%ebp
- # sqr a[6]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 20(%esi),%eax
- adcl $0,%ebp
- movl 4(%esi),%edx
- # sqr a[5]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 16(%esi),%eax
- adcl $0,%ebp
- movl 8(%esi),%edx
- # sqr a[4]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 12(%esi),%eax
- adcl $0,%ebp
- # sqr a[3]*a[3]
- mull %eax
- addl %eax,%ebx
- adcl %edx,%ecx
- movl (%esi),%edx
- adcl $0,%ebp
- movl %ebx,24(%edi)
- movl 28(%esi),%eax
- # saved r[6]
- # ############### Calculate word 7
- xorl %ebx,%ebx
- # sqr a[7]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 24(%esi),%eax
- adcl $0,%ebx
- movl 4(%esi),%edx
- # sqr a[6]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 20(%esi),%eax
- adcl $0,%ebx
- movl 8(%esi),%edx
- # sqr a[5]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 16(%esi),%eax
- adcl $0,%ebx
- movl 12(%esi),%edx
- # sqr a[4]*a[3]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 28(%esi),%eax
- adcl $0,%ebx
- movl %ecx,28(%edi)
- movl 4(%esi),%edx
- # saved r[7]
- # ############### Calculate word 8
- xorl %ecx,%ecx
- # sqr a[7]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 24(%esi),%eax
- adcl $0,%ecx
- movl 8(%esi),%edx
- # sqr a[6]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 20(%esi),%eax
- adcl $0,%ecx
- movl 12(%esi),%edx
- # sqr a[5]*a[3]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 16(%esi),%eax
- adcl $0,%ecx
- # sqr a[4]*a[4]
- mull %eax
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 8(%esi),%edx
- adcl $0,%ecx
- movl %ebp,32(%edi)
- movl 28(%esi),%eax
- # saved r[8]
- # ############### Calculate word 9
- xorl %ebp,%ebp
- # sqr a[7]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 24(%esi),%eax
- adcl $0,%ebp
- movl 12(%esi),%edx
- # sqr a[6]*a[3]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 20(%esi),%eax
- adcl $0,%ebp
- movl 16(%esi),%edx
- # sqr a[5]*a[4]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 28(%esi),%eax
- adcl $0,%ebp
- movl %ebx,36(%edi)
- movl 12(%esi),%edx
- # saved r[9]
- # ############### Calculate word 10
- xorl %ebx,%ebx
- # sqr a[7]*a[3]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 24(%esi),%eax
- adcl $0,%ebx
- movl 16(%esi),%edx
- # sqr a[6]*a[4]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 20(%esi),%eax
- adcl $0,%ebx
- # sqr a[5]*a[5]
- mull %eax
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 16(%esi),%edx
- adcl $0,%ebx
- movl %ecx,40(%edi)
- movl 28(%esi),%eax
- # saved r[10]
- # ############### Calculate word 11
- xorl %ecx,%ecx
- # sqr a[7]*a[4]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 24(%esi),%eax
- adcl $0,%ecx
- movl 20(%esi),%edx
- # sqr a[6]*a[5]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 28(%esi),%eax
- adcl $0,%ecx
- movl %ebp,44(%edi)
- movl 20(%esi),%edx
- # saved r[11]
- # ############### Calculate word 12
- xorl %ebp,%ebp
- # sqr a[7]*a[5]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 24(%esi),%eax
- adcl $0,%ebp
- # sqr a[6]*a[6]
- mull %eax
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 24(%esi),%edx
- adcl $0,%ebp
- movl %ebx,48(%edi)
- movl 28(%esi),%eax
- # saved r[12]
- # ############### Calculate word 13
- xorl %ebx,%ebx
- # sqr a[7]*a[6]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 28(%esi),%eax
- adcl $0,%ebx
- movl %ecx,52(%edi)
- # saved r[13]
- # ############### Calculate word 14
- xorl %ecx,%ecx
- # sqr a[7]*a[7]
- mull %eax
- addl %eax,%ebp
- adcl %edx,%ebx
- adcl $0,%ecx
- movl %ebp,56(%edi)
- # saved r[14]
- movl %ebx,60(%edi)
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-.globl _bn_sqr_comba4
-.private_extern _bn_sqr_comba4
-.align 4
-_bn_sqr_comba4:
-L_bn_sqr_comba4_begin:
- pushl %esi
- pushl %edi
- pushl %ebp
- pushl %ebx
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- xorl %ebx,%ebx
- xorl %ecx,%ecx
- movl (%esi),%eax
- # ############### Calculate word 0
- xorl %ebp,%ebp
- # sqr a[0]*a[0]
- mull %eax
- addl %eax,%ebx
- adcl %edx,%ecx
- movl (%esi),%edx
- adcl $0,%ebp
- movl %ebx,(%edi)
- movl 4(%esi),%eax
- # saved r[0]
- # ############### Calculate word 1
- xorl %ebx,%ebx
- # sqr a[1]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 8(%esi),%eax
- adcl $0,%ebx
- movl %ecx,4(%edi)
- movl (%esi),%edx
- # saved r[1]
- # ############### Calculate word 2
- xorl %ecx,%ecx
- # sqr a[2]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 4(%esi),%eax
- adcl $0,%ecx
- # sqr a[1]*a[1]
- mull %eax
- addl %eax,%ebp
- adcl %edx,%ebx
- movl (%esi),%edx
- adcl $0,%ecx
- movl %ebp,8(%edi)
- movl 12(%esi),%eax
- # saved r[2]
- # ############### Calculate word 3
- xorl %ebp,%ebp
- # sqr a[3]*a[0]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 8(%esi),%eax
- adcl $0,%ebp
- movl 4(%esi),%edx
- # sqr a[2]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebp
- addl %eax,%ebx
- adcl %edx,%ecx
- movl 12(%esi),%eax
- adcl $0,%ebp
- movl %ebx,12(%edi)
- movl 4(%esi),%edx
- # saved r[3]
- # ############### Calculate word 4
- xorl %ebx,%ebx
- # sqr a[3]*a[1]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ebx
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 8(%esi),%eax
- adcl $0,%ebx
- # sqr a[2]*a[2]
- mull %eax
- addl %eax,%ecx
- adcl %edx,%ebp
- movl 8(%esi),%edx
- adcl $0,%ebx
- movl %ecx,16(%edi)
- movl 12(%esi),%eax
- # saved r[4]
- # ############### Calculate word 5
- xorl %ecx,%ecx
- # sqr a[3]*a[2]
- mull %edx
- addl %eax,%eax
- adcl %edx,%edx
- adcl $0,%ecx
- addl %eax,%ebp
- adcl %edx,%ebx
- movl 12(%esi),%eax
- adcl $0,%ecx
- movl %ebp,20(%edi)
- # saved r[5]
- # ############### Calculate word 6
- xorl %ebp,%ebp
- # sqr a[3]*a[3]
- mull %eax
- addl %eax,%ebx
- adcl %edx,%ecx
- adcl $0,%ebp
- movl %ebx,24(%edi)
- # saved r[6]
- movl %ecx,28(%edi)
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S b/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S
deleted file mode 100644
index 24b1f2f..0000000
--- a/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S
+++ /dev/null
@@ -1,288 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _gcm_gmult_ssse3
-.private_extern _gcm_gmult_ssse3
-.align 4
-_gcm_gmult_ssse3:
-L_gcm_gmult_ssse3_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- movdqu (%edi),%xmm0
- call L000pic_point
-L000pic_point:
- popl %eax
- movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7
- movdqa Llow4_mask-L000pic_point(%eax),%xmm2
-.byte 102,15,56,0,199
- movdqa %xmm2,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm2,%xmm0
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- movl $5,%eax
-L001loop_row_1:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L001loop_row_1
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movl $5,%eax
-L002loop_row_2:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L002loop_row_2
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movl $6,%eax
-L003loop_row_3:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L003loop_row_3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
-.byte 102,15,56,0,215
- movdqu %xmm2,(%edi)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _gcm_ghash_ssse3
-.private_extern _gcm_ghash_ssse3
-.align 4
-_gcm_ghash_ssse3:
-L_gcm_ghash_ssse3_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- movl 28(%esp),%edx
- movl 32(%esp),%ecx
- movdqu (%edi),%xmm0
- call L004pic_point
-L004pic_point:
- popl %ebx
- movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7
- andl $-16,%ecx
-.byte 102,15,56,0,199
- pxor %xmm3,%xmm3
-L005loop_ghash:
- movdqa Llow4_mask-L004pic_point(%ebx),%xmm2
- movdqu (%edx),%xmm1
-.byte 102,15,56,0,207
- pxor %xmm1,%xmm0
- movdqa %xmm2,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm2,%xmm0
- pxor %xmm2,%xmm2
- movl $5,%eax
-L006loop_row_4:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L006loop_row_4
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movl $5,%eax
-L007loop_row_5:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L007loop_row_5
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movl $6,%eax
-L008loop_row_6:
- movdqa (%esi),%xmm4
- leal 16(%esi),%esi
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
- pxor %xmm5,%xmm2
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
- subl $1,%eax
- jnz L008loop_row_6
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movdqa %xmm2,%xmm0
- leal -256(%esi),%esi
- leal 16(%edx),%edx
- subl $16,%ecx
- jnz L005loop_ghash
-.byte 102,15,56,0,199
- movdqu %xmm0,(%edi)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 4,0x90
-Lreverse_bytes:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.align 4,0x90
-Llow4_mask:
-.long 252645135,252645135,252645135,252645135
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/ghash-x86-apple.S b/apple-x86/crypto/fipsmodule/ghash-x86-apple.S
deleted file mode 100644
index a178b74..0000000
--- a/apple-x86/crypto/fipsmodule/ghash-x86-apple.S
+++ /dev/null
@@ -1,322 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _gcm_init_clmul
-.private_extern _gcm_init_clmul
-.align 4
-_gcm_init_clmul:
-L_gcm_init_clmul_begin:
- movl 4(%esp),%edx
- movl 8(%esp),%eax
- call L000pic
-L000pic:
- popl %ecx
- leal Lbswap-L000pic(%ecx),%ecx
- movdqu (%eax),%xmm2
- pshufd $78,%xmm2,%xmm2
- pshufd $255,%xmm2,%xmm4
- movdqa %xmm2,%xmm3
- psllq $1,%xmm2
- pxor %xmm5,%xmm5
- psrlq $63,%xmm3
- pcmpgtd %xmm4,%xmm5
- pslldq $8,%xmm3
- por %xmm3,%xmm2
- pand 16(%ecx),%xmm5
- pxor %xmm5,%xmm2
- movdqa %xmm2,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
- pxor %xmm2,%xmm3
- movdqu %xmm2,(%edx)
- pxor %xmm0,%xmm4
- movdqu %xmm0,16(%edx)
-.byte 102,15,58,15,227,8
- movdqu %xmm4,32(%edx)
- ret
-.globl _gcm_gmult_clmul
-.private_extern _gcm_gmult_clmul
-.align 4
-_gcm_gmult_clmul:
-L_gcm_gmult_clmul_begin:
- movl 4(%esp),%eax
- movl 8(%esp),%edx
- call L001pic
-L001pic:
- popl %ecx
- leal Lbswap-L001pic(%ecx),%ecx
- movdqu (%eax),%xmm0
- movdqa (%ecx),%xmm5
- movups (%edx),%xmm2
-.byte 102,15,56,0,197
- movups 32(%edx),%xmm4
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,197
- movdqu %xmm0,(%eax)
- ret
-.globl _gcm_ghash_clmul
-.private_extern _gcm_ghash_clmul
-.align 4
-_gcm_ghash_clmul:
-L_gcm_ghash_clmul_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%eax
- movl 24(%esp),%edx
- movl 28(%esp),%esi
- movl 32(%esp),%ebx
- call L002pic
-L002pic:
- popl %ecx
- leal Lbswap-L002pic(%ecx),%ecx
- movdqu (%eax),%xmm0
- movdqa (%ecx),%xmm5
- movdqu (%edx),%xmm2
-.byte 102,15,56,0,197
- subl $16,%ebx
- jz L003odd_tail
- movdqu (%esi),%xmm3
- movdqu 16(%esi),%xmm6
-.byte 102,15,56,0,221
-.byte 102,15,56,0,245
- movdqu 32(%edx),%xmm5
- pxor %xmm3,%xmm0
- pshufd $78,%xmm6,%xmm3
- movdqa %xmm6,%xmm7
- pxor %xmm6,%xmm3
- leal 32(%esi),%esi
-.byte 102,15,58,68,242,0
-.byte 102,15,58,68,250,17
-.byte 102,15,58,68,221,0
- movups 16(%edx),%xmm2
- nop
- subl $32,%ebx
- jbe L004even_tail
- jmp L005mod_loop
-.align 5,0x90
-L005mod_loop:
- pshufd $78,%xmm0,%xmm4
- movdqa %xmm0,%xmm1
- pxor %xmm0,%xmm4
- nop
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,229,16
- movups (%edx),%xmm2
- xorps %xmm6,%xmm0
- movdqa (%ecx),%xmm5
- xorps %xmm7,%xmm1
- movdqu (%esi),%xmm7
- pxor %xmm0,%xmm3
- movdqu 16(%esi),%xmm6
- pxor %xmm1,%xmm3
-.byte 102,15,56,0,253
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm4
- pslldq $8,%xmm3
- pxor %xmm4,%xmm1
- pxor %xmm3,%xmm0
-.byte 102,15,56,0,245
- pxor %xmm7,%xmm1
- movdqa %xmm6,%xmm7
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
-.byte 102,15,58,68,242,0
- movups 32(%edx),%xmm5
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- pshufd $78,%xmm7,%xmm3
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm7,%xmm3
- pxor %xmm4,%xmm1
-.byte 102,15,58,68,250,17
- movups 16(%edx),%xmm2
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-.byte 102,15,58,68,221,0
- leal 32(%esi),%esi
- subl $32,%ebx
- ja L005mod_loop
-L004even_tail:
- pshufd $78,%xmm0,%xmm4
- movdqa %xmm0,%xmm1
- pxor %xmm0,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,229,16
- movdqa (%ecx),%xmm5
- xorps %xmm6,%xmm0
- xorps %xmm7,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm3,%xmm4
- movdqa %xmm4,%xmm3
- psrldq $8,%xmm4
- pslldq $8,%xmm3
- pxor %xmm4,%xmm1
- pxor %xmm3,%xmm0
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- testl %ebx,%ebx
- jnz L006done
- movups (%edx),%xmm2
-L003odd_tail:
- movdqu (%esi),%xmm3
-.byte 102,15,56,0,221
- pxor %xmm3,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pshufd $78,%xmm2,%xmm4
- pxor %xmm0,%xmm3
- pxor %xmm2,%xmm4
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- xorps %xmm0,%xmm3
- xorps %xmm1,%xmm3
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-L006done:
-.byte 102,15,56,0,197
- movdqu %xmm0,(%eax)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 6,0x90
-Lbswap:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
-.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
-.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
-.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
-.byte 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/md5-586-apple.S b/apple-x86/crypto/fipsmodule/md5-586-apple.S
deleted file mode 100644
index 986d590..0000000
--- a/apple-x86/crypto/fipsmodule/md5-586-apple.S
+++ /dev/null
@@ -1,684 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _md5_block_asm_data_order
-.private_extern _md5_block_asm_data_order
-.align 4
-_md5_block_asm_data_order:
-L_md5_block_asm_data_order_begin:
- pushl %esi
- pushl %edi
- movl 12(%esp),%edi
- movl 16(%esp),%esi
- movl 20(%esp),%ecx
- pushl %ebp
- shll $6,%ecx
- pushl %ebx
- addl %esi,%ecx
- subl $64,%ecx
- movl (%edi),%eax
- pushl %ecx
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
-L000start:
-
- # R0 section
- movl %ecx,%edi
- movl (%esi),%ebp
- # R0 0
- xorl %edx,%edi
- andl %ebx,%edi
- leal 3614090360(%eax,%ebp,1),%eax
- xorl %edx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $7,%eax
- movl 4(%esi),%ebp
- addl %ebx,%eax
- # R0 1
- xorl %ecx,%edi
- andl %eax,%edi
- leal 3905402710(%edx,%ebp,1),%edx
- xorl %ecx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $12,%edx
- movl 8(%esi),%ebp
- addl %eax,%edx
- # R0 2
- xorl %ebx,%edi
- andl %edx,%edi
- leal 606105819(%ecx,%ebp,1),%ecx
- xorl %ebx,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $17,%ecx
- movl 12(%esi),%ebp
- addl %edx,%ecx
- # R0 3
- xorl %eax,%edi
- andl %ecx,%edi
- leal 3250441966(%ebx,%ebp,1),%ebx
- xorl %eax,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $22,%ebx
- movl 16(%esi),%ebp
- addl %ecx,%ebx
- # R0 4
- xorl %edx,%edi
- andl %ebx,%edi
- leal 4118548399(%eax,%ebp,1),%eax
- xorl %edx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $7,%eax
- movl 20(%esi),%ebp
- addl %ebx,%eax
- # R0 5
- xorl %ecx,%edi
- andl %eax,%edi
- leal 1200080426(%edx,%ebp,1),%edx
- xorl %ecx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $12,%edx
- movl 24(%esi),%ebp
- addl %eax,%edx
- # R0 6
- xorl %ebx,%edi
- andl %edx,%edi
- leal 2821735955(%ecx,%ebp,1),%ecx
- xorl %ebx,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $17,%ecx
- movl 28(%esi),%ebp
- addl %edx,%ecx
- # R0 7
- xorl %eax,%edi
- andl %ecx,%edi
- leal 4249261313(%ebx,%ebp,1),%ebx
- xorl %eax,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $22,%ebx
- movl 32(%esi),%ebp
- addl %ecx,%ebx
- # R0 8
- xorl %edx,%edi
- andl %ebx,%edi
- leal 1770035416(%eax,%ebp,1),%eax
- xorl %edx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $7,%eax
- movl 36(%esi),%ebp
- addl %ebx,%eax
- # R0 9
- xorl %ecx,%edi
- andl %eax,%edi
- leal 2336552879(%edx,%ebp,1),%edx
- xorl %ecx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $12,%edx
- movl 40(%esi),%ebp
- addl %eax,%edx
- # R0 10
- xorl %ebx,%edi
- andl %edx,%edi
- leal 4294925233(%ecx,%ebp,1),%ecx
- xorl %ebx,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $17,%ecx
- movl 44(%esi),%ebp
- addl %edx,%ecx
- # R0 11
- xorl %eax,%edi
- andl %ecx,%edi
- leal 2304563134(%ebx,%ebp,1),%ebx
- xorl %eax,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $22,%ebx
- movl 48(%esi),%ebp
- addl %ecx,%ebx
- # R0 12
- xorl %edx,%edi
- andl %ebx,%edi
- leal 1804603682(%eax,%ebp,1),%eax
- xorl %edx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $7,%eax
- movl 52(%esi),%ebp
- addl %ebx,%eax
- # R0 13
- xorl %ecx,%edi
- andl %eax,%edi
- leal 4254626195(%edx,%ebp,1),%edx
- xorl %ecx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $12,%edx
- movl 56(%esi),%ebp
- addl %eax,%edx
- # R0 14
- xorl %ebx,%edi
- andl %edx,%edi
- leal 2792965006(%ecx,%ebp,1),%ecx
- xorl %ebx,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $17,%ecx
- movl 60(%esi),%ebp
- addl %edx,%ecx
- # R0 15
- xorl %eax,%edi
- andl %ecx,%edi
- leal 1236535329(%ebx,%ebp,1),%ebx
- xorl %eax,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $22,%ebx
- movl 4(%esi),%ebp
- addl %ecx,%ebx
-
- # R1 section
- # R1 16
- leal 4129170786(%eax,%ebp,1),%eax
- xorl %ebx,%edi
- andl %edx,%edi
- movl 24(%esi),%ebp
- xorl %ecx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $5,%eax
- addl %ebx,%eax
- # R1 17
- leal 3225465664(%edx,%ebp,1),%edx
- xorl %eax,%edi
- andl %ecx,%edi
- movl 44(%esi),%ebp
- xorl %ebx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $9,%edx
- addl %eax,%edx
- # R1 18
- leal 643717713(%ecx,%ebp,1),%ecx
- xorl %edx,%edi
- andl %ebx,%edi
- movl (%esi),%ebp
- xorl %eax,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $14,%ecx
- addl %edx,%ecx
- # R1 19
- leal 3921069994(%ebx,%ebp,1),%ebx
- xorl %ecx,%edi
- andl %eax,%edi
- movl 20(%esi),%ebp
- xorl %edx,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $20,%ebx
- addl %ecx,%ebx
- # R1 20
- leal 3593408605(%eax,%ebp,1),%eax
- xorl %ebx,%edi
- andl %edx,%edi
- movl 40(%esi),%ebp
- xorl %ecx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $5,%eax
- addl %ebx,%eax
- # R1 21
- leal 38016083(%edx,%ebp,1),%edx
- xorl %eax,%edi
- andl %ecx,%edi
- movl 60(%esi),%ebp
- xorl %ebx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $9,%edx
- addl %eax,%edx
- # R1 22
- leal 3634488961(%ecx,%ebp,1),%ecx
- xorl %edx,%edi
- andl %ebx,%edi
- movl 16(%esi),%ebp
- xorl %eax,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $14,%ecx
- addl %edx,%ecx
- # R1 23
- leal 3889429448(%ebx,%ebp,1),%ebx
- xorl %ecx,%edi
- andl %eax,%edi
- movl 36(%esi),%ebp
- xorl %edx,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $20,%ebx
- addl %ecx,%ebx
- # R1 24
- leal 568446438(%eax,%ebp,1),%eax
- xorl %ebx,%edi
- andl %edx,%edi
- movl 56(%esi),%ebp
- xorl %ecx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $5,%eax
- addl %ebx,%eax
- # R1 25
- leal 3275163606(%edx,%ebp,1),%edx
- xorl %eax,%edi
- andl %ecx,%edi
- movl 12(%esi),%ebp
- xorl %ebx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $9,%edx
- addl %eax,%edx
- # R1 26
- leal 4107603335(%ecx,%ebp,1),%ecx
- xorl %edx,%edi
- andl %ebx,%edi
- movl 32(%esi),%ebp
- xorl %eax,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $14,%ecx
- addl %edx,%ecx
- # R1 27
- leal 1163531501(%ebx,%ebp,1),%ebx
- xorl %ecx,%edi
- andl %eax,%edi
- movl 52(%esi),%ebp
- xorl %edx,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $20,%ebx
- addl %ecx,%ebx
- # R1 28
- leal 2850285829(%eax,%ebp,1),%eax
- xorl %ebx,%edi
- andl %edx,%edi
- movl 8(%esi),%ebp
- xorl %ecx,%edi
- addl %edi,%eax
- movl %ebx,%edi
- roll $5,%eax
- addl %ebx,%eax
- # R1 29
- leal 4243563512(%edx,%ebp,1),%edx
- xorl %eax,%edi
- andl %ecx,%edi
- movl 28(%esi),%ebp
- xorl %ebx,%edi
- addl %edi,%edx
- movl %eax,%edi
- roll $9,%edx
- addl %eax,%edx
- # R1 30
- leal 1735328473(%ecx,%ebp,1),%ecx
- xorl %edx,%edi
- andl %ebx,%edi
- movl 48(%esi),%ebp
- xorl %eax,%edi
- addl %edi,%ecx
- movl %edx,%edi
- roll $14,%ecx
- addl %edx,%ecx
- # R1 31
- leal 2368359562(%ebx,%ebp,1),%ebx
- xorl %ecx,%edi
- andl %eax,%edi
- movl 20(%esi),%ebp
- xorl %edx,%edi
- addl %edi,%ebx
- movl %ecx,%edi
- roll $20,%ebx
- addl %ecx,%ebx
-
- # R2 section
- # R2 32
- xorl %edx,%edi
- xorl %ebx,%edi
- leal 4294588738(%eax,%ebp,1),%eax
- addl %edi,%eax
- roll $4,%eax
- movl 32(%esi),%ebp
- movl %ebx,%edi
- # R2 33
- leal 2272392833(%edx,%ebp,1),%edx
- addl %ebx,%eax
- xorl %ecx,%edi
- xorl %eax,%edi
- movl 44(%esi),%ebp
- addl %edi,%edx
- movl %eax,%edi
- roll $11,%edx
- addl %eax,%edx
- # R2 34
- xorl %ebx,%edi
- xorl %edx,%edi
- leal 1839030562(%ecx,%ebp,1),%ecx
- addl %edi,%ecx
- roll $16,%ecx
- movl 56(%esi),%ebp
- movl %edx,%edi
- # R2 35
- leal 4259657740(%ebx,%ebp,1),%ebx
- addl %edx,%ecx
- xorl %eax,%edi
- xorl %ecx,%edi
- movl 4(%esi),%ebp
- addl %edi,%ebx
- movl %ecx,%edi
- roll $23,%ebx
- addl %ecx,%ebx
- # R2 36
- xorl %edx,%edi
- xorl %ebx,%edi
- leal 2763975236(%eax,%ebp,1),%eax
- addl %edi,%eax
- roll $4,%eax
- movl 16(%esi),%ebp
- movl %ebx,%edi
- # R2 37
- leal 1272893353(%edx,%ebp,1),%edx
- addl %ebx,%eax
- xorl %ecx,%edi
- xorl %eax,%edi
- movl 28(%esi),%ebp
- addl %edi,%edx
- movl %eax,%edi
- roll $11,%edx
- addl %eax,%edx
- # R2 38
- xorl %ebx,%edi
- xorl %edx,%edi
- leal 4139469664(%ecx,%ebp,1),%ecx
- addl %edi,%ecx
- roll $16,%ecx
- movl 40(%esi),%ebp
- movl %edx,%edi
- # R2 39
- leal 3200236656(%ebx,%ebp,1),%ebx
- addl %edx,%ecx
- xorl %eax,%edi
- xorl %ecx,%edi
- movl 52(%esi),%ebp
- addl %edi,%ebx
- movl %ecx,%edi
- roll $23,%ebx
- addl %ecx,%ebx
- # R2 40
- xorl %edx,%edi
- xorl %ebx,%edi
- leal 681279174(%eax,%ebp,1),%eax
- addl %edi,%eax
- roll $4,%eax
- movl (%esi),%ebp
- movl %ebx,%edi
- # R2 41
- leal 3936430074(%edx,%ebp,1),%edx
- addl %ebx,%eax
- xorl %ecx,%edi
- xorl %eax,%edi
- movl 12(%esi),%ebp
- addl %edi,%edx
- movl %eax,%edi
- roll $11,%edx
- addl %eax,%edx
- # R2 42
- xorl %ebx,%edi
- xorl %edx,%edi
- leal 3572445317(%ecx,%ebp,1),%ecx
- addl %edi,%ecx
- roll $16,%ecx
- movl 24(%esi),%ebp
- movl %edx,%edi
- # R2 43
- leal 76029189(%ebx,%ebp,1),%ebx
- addl %edx,%ecx
- xorl %eax,%edi
- xorl %ecx,%edi
- movl 36(%esi),%ebp
- addl %edi,%ebx
- movl %ecx,%edi
- roll $23,%ebx
- addl %ecx,%ebx
- # R2 44
- xorl %edx,%edi
- xorl %ebx,%edi
- leal 3654602809(%eax,%ebp,1),%eax
- addl %edi,%eax
- roll $4,%eax
- movl 48(%esi),%ebp
- movl %ebx,%edi
- # R2 45
- leal 3873151461(%edx,%ebp,1),%edx
- addl %ebx,%eax
- xorl %ecx,%edi
- xorl %eax,%edi
- movl 60(%esi),%ebp
- addl %edi,%edx
- movl %eax,%edi
- roll $11,%edx
- addl %eax,%edx
- # R2 46
- xorl %ebx,%edi
- xorl %edx,%edi
- leal 530742520(%ecx,%ebp,1),%ecx
- addl %edi,%ecx
- roll $16,%ecx
- movl 8(%esi),%ebp
- movl %edx,%edi
- # R2 47
- leal 3299628645(%ebx,%ebp,1),%ebx
- addl %edx,%ecx
- xorl %eax,%edi
- xorl %ecx,%edi
- movl (%esi),%ebp
- addl %edi,%ebx
- movl $-1,%edi
- roll $23,%ebx
- addl %ecx,%ebx
-
- # R3 section
- # R3 48
- xorl %edx,%edi
- orl %ebx,%edi
- leal 4096336452(%eax,%ebp,1),%eax
- xorl %ecx,%edi
- movl 28(%esi),%ebp
- addl %edi,%eax
- movl $-1,%edi
- roll $6,%eax
- xorl %ecx,%edi
- addl %ebx,%eax
- # R3 49
- orl %eax,%edi
- leal 1126891415(%edx,%ebp,1),%edx
- xorl %ebx,%edi
- movl 56(%esi),%ebp
- addl %edi,%edx
- movl $-1,%edi
- roll $10,%edx
- xorl %ebx,%edi
- addl %eax,%edx
- # R3 50
- orl %edx,%edi
- leal 2878612391(%ecx,%ebp,1),%ecx
- xorl %eax,%edi
- movl 20(%esi),%ebp
- addl %edi,%ecx
- movl $-1,%edi
- roll $15,%ecx
- xorl %eax,%edi
- addl %edx,%ecx
- # R3 51
- orl %ecx,%edi
- leal 4237533241(%ebx,%ebp,1),%ebx
- xorl %edx,%edi
- movl 48(%esi),%ebp
- addl %edi,%ebx
- movl $-1,%edi
- roll $21,%ebx
- xorl %edx,%edi
- addl %ecx,%ebx
- # R3 52
- orl %ebx,%edi
- leal 1700485571(%eax,%ebp,1),%eax
- xorl %ecx,%edi
- movl 12(%esi),%ebp
- addl %edi,%eax
- movl $-1,%edi
- roll $6,%eax
- xorl %ecx,%edi
- addl %ebx,%eax
- # R3 53
- orl %eax,%edi
- leal 2399980690(%edx,%ebp,1),%edx
- xorl %ebx,%edi
- movl 40(%esi),%ebp
- addl %edi,%edx
- movl $-1,%edi
- roll $10,%edx
- xorl %ebx,%edi
- addl %eax,%edx
- # R3 54
- orl %edx,%edi
- leal 4293915773(%ecx,%ebp,1),%ecx
- xorl %eax,%edi
- movl 4(%esi),%ebp
- addl %edi,%ecx
- movl $-1,%edi
- roll $15,%ecx
- xorl %eax,%edi
- addl %edx,%ecx
- # R3 55
- orl %ecx,%edi
- leal 2240044497(%ebx,%ebp,1),%ebx
- xorl %edx,%edi
- movl 32(%esi),%ebp
- addl %edi,%ebx
- movl $-1,%edi
- roll $21,%ebx
- xorl %edx,%edi
- addl %ecx,%ebx
- # R3 56
- orl %ebx,%edi
- leal 1873313359(%eax,%ebp,1),%eax
- xorl %ecx,%edi
- movl 60(%esi),%ebp
- addl %edi,%eax
- movl $-1,%edi
- roll $6,%eax
- xorl %ecx,%edi
- addl %ebx,%eax
- # R3 57
- orl %eax,%edi
- leal 4264355552(%edx,%ebp,1),%edx
- xorl %ebx,%edi
- movl 24(%esi),%ebp
- addl %edi,%edx
- movl $-1,%edi
- roll $10,%edx
- xorl %ebx,%edi
- addl %eax,%edx
- # R3 58
- orl %edx,%edi
- leal 2734768916(%ecx,%ebp,1),%ecx
- xorl %eax,%edi
- movl 52(%esi),%ebp
- addl %edi,%ecx
- movl $-1,%edi
- roll $15,%ecx
- xorl %eax,%edi
- addl %edx,%ecx
- # R3 59
- orl %ecx,%edi
- leal 1309151649(%ebx,%ebp,1),%ebx
- xorl %edx,%edi
- movl 16(%esi),%ebp
- addl %edi,%ebx
- movl $-1,%edi
- roll $21,%ebx
- xorl %edx,%edi
- addl %ecx,%ebx
- # R3 60
- orl %ebx,%edi
- leal 4149444226(%eax,%ebp,1),%eax
- xorl %ecx,%edi
- movl 44(%esi),%ebp
- addl %edi,%eax
- movl $-1,%edi
- roll $6,%eax
- xorl %ecx,%edi
- addl %ebx,%eax
- # R3 61
- orl %eax,%edi
- leal 3174756917(%edx,%ebp,1),%edx
- xorl %ebx,%edi
- movl 8(%esi),%ebp
- addl %edi,%edx
- movl $-1,%edi
- roll $10,%edx
- xorl %ebx,%edi
- addl %eax,%edx
- # R3 62
- orl %edx,%edi
- leal 718787259(%ecx,%ebp,1),%ecx
- xorl %eax,%edi
- movl 36(%esi),%ebp
- addl %edi,%ecx
- movl $-1,%edi
- roll $15,%ecx
- xorl %eax,%edi
- addl %edx,%ecx
- # R3 63
- orl %ecx,%edi
- leal 3951481745(%ebx,%ebp,1),%ebx
- xorl %edx,%edi
- movl 24(%esp),%ebp
- addl %edi,%ebx
- addl $64,%esi
- roll $21,%ebx
- movl (%ebp),%edi
- addl %ecx,%ebx
- addl %edi,%eax
- movl 4(%ebp),%edi
- addl %edi,%ebx
- movl 8(%ebp),%edi
- addl %edi,%ecx
- movl 12(%ebp),%edi
- addl %edi,%edx
- movl %eax,(%ebp)
- movl %ebx,4(%ebp)
- movl (%esp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- cmpl %esi,%edi
- jae L000start
- popl %eax
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha1-586-apple.S b/apple-x86/crypto/fipsmodule/sha1-586-apple.S
deleted file mode 100644
index 76ee6bc..0000000
--- a/apple-x86/crypto/fipsmodule/sha1-586-apple.S
+++ /dev/null
@@ -1,3804 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _sha1_block_data_order
-.private_extern _sha1_block_data_order
-.align 4
-_sha1_block_data_order:
-L_sha1_block_data_order_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call L000pic_point
-L000pic_point:
- popl %ebp
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000pic_point(%ebp),%esi
- leal LK_XX_XX-L000pic_point(%ebp),%ebp
- movl (%esi),%eax
- movl 4(%esi),%edx
- testl $512,%edx
- jz L001x86
- movl 8(%esi),%ecx
- testl $16777216,%eax
- jz L001x86
- andl $268435456,%edx
- andl $1073741824,%eax
- orl %edx,%eax
- cmpl $1342177280,%eax
- je Lavx_shortcut
- jmp Lssse3_shortcut
-.align 4,0x90
-L001x86:
- movl 20(%esp),%ebp
- movl 24(%esp),%esi
- movl 28(%esp),%eax
- subl $76,%esp
- shll $6,%eax
- addl %esi,%eax
- movl %eax,104(%esp)
- movl 16(%ebp),%edi
- jmp L002loop
-.align 4,0x90
-L002loop:
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movl %ecx,8(%esp)
- movl %edx,12(%esp)
- movl 16(%esi),%eax
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- movl %eax,16(%esp)
- movl %ebx,20(%esp)
- movl %ecx,24(%esp)
- movl %edx,28(%esp)
- movl 32(%esi),%eax
- movl 36(%esi),%ebx
- movl 40(%esi),%ecx
- movl 44(%esi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,40(%esp)
- movl %edx,44(%esp)
- movl 48(%esi),%eax
- movl 52(%esi),%ebx
- movl 56(%esi),%ecx
- movl 60(%esi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- movl %eax,48(%esp)
- movl %ebx,52(%esp)
- movl %ecx,56(%esp)
- movl %edx,60(%esp)
- movl %esi,100(%esp)
- movl (%ebp),%eax
- movl 4(%ebp),%ebx
- movl 8(%ebp),%ecx
- movl 12(%ebp),%edx
- # 00_15 0
- movl %ecx,%esi
- movl %eax,%ebp
- roll $5,%ebp
- xorl %edx,%esi
- addl %edi,%ebp
- movl (%esp),%edi
- andl %ebx,%esi
- rorl $2,%ebx
- xorl %edx,%esi
- leal 1518500249(%ebp,%edi,1),%ebp
- addl %esi,%ebp
- # 00_15 1
- movl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- xorl %ecx,%edi
- addl %edx,%ebp
- movl 4(%esp),%edx
- andl %eax,%edi
- rorl $2,%eax
- xorl %ecx,%edi
- leal 1518500249(%ebp,%edx,1),%ebp
- addl %edi,%ebp
- # 00_15 2
- movl %eax,%edx
- movl %ebp,%edi
- roll $5,%ebp
- xorl %ebx,%edx
- addl %ecx,%ebp
- movl 8(%esp),%ecx
- andl %esi,%edx
- rorl $2,%esi
- xorl %ebx,%edx
- leal 1518500249(%ebp,%ecx,1),%ebp
- addl %edx,%ebp
- # 00_15 3
- movl %esi,%ecx
- movl %ebp,%edx
- roll $5,%ebp
- xorl %eax,%ecx
- addl %ebx,%ebp
- movl 12(%esp),%ebx
- andl %edi,%ecx
- rorl $2,%edi
- xorl %eax,%ecx
- leal 1518500249(%ebp,%ebx,1),%ebp
- addl %ecx,%ebp
- # 00_15 4
- movl %edi,%ebx
- movl %ebp,%ecx
- roll $5,%ebp
- xorl %esi,%ebx
- addl %eax,%ebp
- movl 16(%esp),%eax
- andl %edx,%ebx
- rorl $2,%edx
- xorl %esi,%ebx
- leal 1518500249(%ebp,%eax,1),%ebp
- addl %ebx,%ebp
- # 00_15 5
- movl %edx,%eax
- movl %ebp,%ebx
- roll $5,%ebp
- xorl %edi,%eax
- addl %esi,%ebp
- movl 20(%esp),%esi
- andl %ecx,%eax
- rorl $2,%ecx
- xorl %edi,%eax
- leal 1518500249(%ebp,%esi,1),%ebp
- addl %eax,%ebp
- # 00_15 6
- movl %ecx,%esi
- movl %ebp,%eax
- roll $5,%ebp
- xorl %edx,%esi
- addl %edi,%ebp
- movl 24(%esp),%edi
- andl %ebx,%esi
- rorl $2,%ebx
- xorl %edx,%esi
- leal 1518500249(%ebp,%edi,1),%ebp
- addl %esi,%ebp
- # 00_15 7
- movl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- xorl %ecx,%edi
- addl %edx,%ebp
- movl 28(%esp),%edx
- andl %eax,%edi
- rorl $2,%eax
- xorl %ecx,%edi
- leal 1518500249(%ebp,%edx,1),%ebp
- addl %edi,%ebp
- # 00_15 8
- movl %eax,%edx
- movl %ebp,%edi
- roll $5,%ebp
- xorl %ebx,%edx
- addl %ecx,%ebp
- movl 32(%esp),%ecx
- andl %esi,%edx
- rorl $2,%esi
- xorl %ebx,%edx
- leal 1518500249(%ebp,%ecx,1),%ebp
- addl %edx,%ebp
- # 00_15 9
- movl %esi,%ecx
- movl %ebp,%edx
- roll $5,%ebp
- xorl %eax,%ecx
- addl %ebx,%ebp
- movl 36(%esp),%ebx
- andl %edi,%ecx
- rorl $2,%edi
- xorl %eax,%ecx
- leal 1518500249(%ebp,%ebx,1),%ebp
- addl %ecx,%ebp
- # 00_15 10
- movl %edi,%ebx
- movl %ebp,%ecx
- roll $5,%ebp
- xorl %esi,%ebx
- addl %eax,%ebp
- movl 40(%esp),%eax
- andl %edx,%ebx
- rorl $2,%edx
- xorl %esi,%ebx
- leal 1518500249(%ebp,%eax,1),%ebp
- addl %ebx,%ebp
- # 00_15 11
- movl %edx,%eax
- movl %ebp,%ebx
- roll $5,%ebp
- xorl %edi,%eax
- addl %esi,%ebp
- movl 44(%esp),%esi
- andl %ecx,%eax
- rorl $2,%ecx
- xorl %edi,%eax
- leal 1518500249(%ebp,%esi,1),%ebp
- addl %eax,%ebp
- # 00_15 12
- movl %ecx,%esi
- movl %ebp,%eax
- roll $5,%ebp
- xorl %edx,%esi
- addl %edi,%ebp
- movl 48(%esp),%edi
- andl %ebx,%esi
- rorl $2,%ebx
- xorl %edx,%esi
- leal 1518500249(%ebp,%edi,1),%ebp
- addl %esi,%ebp
- # 00_15 13
- movl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- xorl %ecx,%edi
- addl %edx,%ebp
- movl 52(%esp),%edx
- andl %eax,%edi
- rorl $2,%eax
- xorl %ecx,%edi
- leal 1518500249(%ebp,%edx,1),%ebp
- addl %edi,%ebp
- # 00_15 14
- movl %eax,%edx
- movl %ebp,%edi
- roll $5,%ebp
- xorl %ebx,%edx
- addl %ecx,%ebp
- movl 56(%esp),%ecx
- andl %esi,%edx
- rorl $2,%esi
- xorl %ebx,%edx
- leal 1518500249(%ebp,%ecx,1),%ebp
- addl %edx,%ebp
- # 00_15 15
- movl %esi,%ecx
- movl %ebp,%edx
- roll $5,%ebp
- xorl %eax,%ecx
- addl %ebx,%ebp
- movl 60(%esp),%ebx
- andl %edi,%ecx
- rorl $2,%edi
- xorl %eax,%ecx
- leal 1518500249(%ebp,%ebx,1),%ebp
- movl (%esp),%ebx
- addl %ebp,%ecx
- # 16_19 16
- movl %edi,%ebp
- xorl 8(%esp),%ebx
- xorl %esi,%ebp
- xorl 32(%esp),%ebx
- andl %edx,%ebp
- xorl 52(%esp),%ebx
- roll $1,%ebx
- xorl %esi,%ebp
- addl %ebp,%eax
- movl %ecx,%ebp
- rorl $2,%edx
- movl %ebx,(%esp)
- roll $5,%ebp
- leal 1518500249(%ebx,%eax,1),%ebx
- movl 4(%esp),%eax
- addl %ebp,%ebx
- # 16_19 17
- movl %edx,%ebp
- xorl 12(%esp),%eax
- xorl %edi,%ebp
- xorl 36(%esp),%eax
- andl %ecx,%ebp
- xorl 56(%esp),%eax
- roll $1,%eax
- xorl %edi,%ebp
- addl %ebp,%esi
- movl %ebx,%ebp
- rorl $2,%ecx
- movl %eax,4(%esp)
- roll $5,%ebp
- leal 1518500249(%eax,%esi,1),%eax
- movl 8(%esp),%esi
- addl %ebp,%eax
- # 16_19 18
- movl %ecx,%ebp
- xorl 16(%esp),%esi
- xorl %edx,%ebp
- xorl 40(%esp),%esi
- andl %ebx,%ebp
- xorl 60(%esp),%esi
- roll $1,%esi
- xorl %edx,%ebp
- addl %ebp,%edi
- movl %eax,%ebp
- rorl $2,%ebx
- movl %esi,8(%esp)
- roll $5,%ebp
- leal 1518500249(%esi,%edi,1),%esi
- movl 12(%esp),%edi
- addl %ebp,%esi
- # 16_19 19
- movl %ebx,%ebp
- xorl 20(%esp),%edi
- xorl %ecx,%ebp
- xorl 44(%esp),%edi
- andl %eax,%ebp
- xorl (%esp),%edi
- roll $1,%edi
- xorl %ecx,%ebp
- addl %ebp,%edx
- movl %esi,%ebp
- rorl $2,%eax
- movl %edi,12(%esp)
- roll $5,%ebp
- leal 1518500249(%edi,%edx,1),%edi
- movl 16(%esp),%edx
- addl %ebp,%edi
- # 20_39 20
- movl %esi,%ebp
- xorl 24(%esp),%edx
- xorl %eax,%ebp
- xorl 48(%esp),%edx
- xorl %ebx,%ebp
- xorl 4(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,16(%esp)
- leal 1859775393(%edx,%ecx,1),%edx
- movl 20(%esp),%ecx
- addl %ebp,%edx
- # 20_39 21
- movl %edi,%ebp
- xorl 28(%esp),%ecx
- xorl %esi,%ebp
- xorl 52(%esp),%ecx
- xorl %eax,%ebp
- xorl 8(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,20(%esp)
- leal 1859775393(%ecx,%ebx,1),%ecx
- movl 24(%esp),%ebx
- addl %ebp,%ecx
- # 20_39 22
- movl %edx,%ebp
- xorl 32(%esp),%ebx
- xorl %edi,%ebp
- xorl 56(%esp),%ebx
- xorl %esi,%ebp
- xorl 12(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,24(%esp)
- leal 1859775393(%ebx,%eax,1),%ebx
- movl 28(%esp),%eax
- addl %ebp,%ebx
- # 20_39 23
- movl %ecx,%ebp
- xorl 36(%esp),%eax
- xorl %edx,%ebp
- xorl 60(%esp),%eax
- xorl %edi,%ebp
- xorl 16(%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- movl %eax,28(%esp)
- leal 1859775393(%eax,%esi,1),%eax
- movl 32(%esp),%esi
- addl %ebp,%eax
- # 20_39 24
- movl %ebx,%ebp
- xorl 40(%esp),%esi
- xorl %ecx,%ebp
- xorl (%esp),%esi
- xorl %edx,%ebp
- xorl 20(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,32(%esp)
- leal 1859775393(%esi,%edi,1),%esi
- movl 36(%esp),%edi
- addl %ebp,%esi
- # 20_39 25
- movl %eax,%ebp
- xorl 44(%esp),%edi
- xorl %ebx,%ebp
- xorl 4(%esp),%edi
- xorl %ecx,%ebp
- xorl 24(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,36(%esp)
- leal 1859775393(%edi,%edx,1),%edi
- movl 40(%esp),%edx
- addl %ebp,%edi
- # 20_39 26
- movl %esi,%ebp
- xorl 48(%esp),%edx
- xorl %eax,%ebp
- xorl 8(%esp),%edx
- xorl %ebx,%ebp
- xorl 28(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,40(%esp)
- leal 1859775393(%edx,%ecx,1),%edx
- movl 44(%esp),%ecx
- addl %ebp,%edx
- # 20_39 27
- movl %edi,%ebp
- xorl 52(%esp),%ecx
- xorl %esi,%ebp
- xorl 12(%esp),%ecx
- xorl %eax,%ebp
- xorl 32(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,44(%esp)
- leal 1859775393(%ecx,%ebx,1),%ecx
- movl 48(%esp),%ebx
- addl %ebp,%ecx
- # 20_39 28
- movl %edx,%ebp
- xorl 56(%esp),%ebx
- xorl %edi,%ebp
- xorl 16(%esp),%ebx
- xorl %esi,%ebp
- xorl 36(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,48(%esp)
- leal 1859775393(%ebx,%eax,1),%ebx
- movl 52(%esp),%eax
- addl %ebp,%ebx
- # 20_39 29
- movl %ecx,%ebp
- xorl 60(%esp),%eax
- xorl %edx,%ebp
- xorl 20(%esp),%eax
- xorl %edi,%ebp
- xorl 40(%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- movl %eax,52(%esp)
- leal 1859775393(%eax,%esi,1),%eax
- movl 56(%esp),%esi
- addl %ebp,%eax
- # 20_39 30
- movl %ebx,%ebp
- xorl (%esp),%esi
- xorl %ecx,%ebp
- xorl 24(%esp),%esi
- xorl %edx,%ebp
- xorl 44(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,56(%esp)
- leal 1859775393(%esi,%edi,1),%esi
- movl 60(%esp),%edi
- addl %ebp,%esi
- # 20_39 31
- movl %eax,%ebp
- xorl 4(%esp),%edi
- xorl %ebx,%ebp
- xorl 28(%esp),%edi
- xorl %ecx,%ebp
- xorl 48(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,60(%esp)
- leal 1859775393(%edi,%edx,1),%edi
- movl (%esp),%edx
- addl %ebp,%edi
- # 20_39 32
- movl %esi,%ebp
- xorl 8(%esp),%edx
- xorl %eax,%ebp
- xorl 32(%esp),%edx
- xorl %ebx,%ebp
- xorl 52(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,(%esp)
- leal 1859775393(%edx,%ecx,1),%edx
- movl 4(%esp),%ecx
- addl %ebp,%edx
- # 20_39 33
- movl %edi,%ebp
- xorl 12(%esp),%ecx
- xorl %esi,%ebp
- xorl 36(%esp),%ecx
- xorl %eax,%ebp
- xorl 56(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,4(%esp)
- leal 1859775393(%ecx,%ebx,1),%ecx
- movl 8(%esp),%ebx
- addl %ebp,%ecx
- # 20_39 34
- movl %edx,%ebp
- xorl 16(%esp),%ebx
- xorl %edi,%ebp
- xorl 40(%esp),%ebx
- xorl %esi,%ebp
- xorl 60(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,8(%esp)
- leal 1859775393(%ebx,%eax,1),%ebx
- movl 12(%esp),%eax
- addl %ebp,%ebx
- # 20_39 35
- movl %ecx,%ebp
- xorl 20(%esp),%eax
- xorl %edx,%ebp
- xorl 44(%esp),%eax
- xorl %edi,%ebp
- xorl (%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- movl %eax,12(%esp)
- leal 1859775393(%eax,%esi,1),%eax
- movl 16(%esp),%esi
- addl %ebp,%eax
- # 20_39 36
- movl %ebx,%ebp
- xorl 24(%esp),%esi
- xorl %ecx,%ebp
- xorl 48(%esp),%esi
- xorl %edx,%ebp
- xorl 4(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,16(%esp)
- leal 1859775393(%esi,%edi,1),%esi
- movl 20(%esp),%edi
- addl %ebp,%esi
- # 20_39 37
- movl %eax,%ebp
- xorl 28(%esp),%edi
- xorl %ebx,%ebp
- xorl 52(%esp),%edi
- xorl %ecx,%ebp
- xorl 8(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,20(%esp)
- leal 1859775393(%edi,%edx,1),%edi
- movl 24(%esp),%edx
- addl %ebp,%edi
- # 20_39 38
- movl %esi,%ebp
- xorl 32(%esp),%edx
- xorl %eax,%ebp
- xorl 56(%esp),%edx
- xorl %ebx,%ebp
- xorl 12(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,24(%esp)
- leal 1859775393(%edx,%ecx,1),%edx
- movl 28(%esp),%ecx
- addl %ebp,%edx
- # 20_39 39
- movl %edi,%ebp
- xorl 36(%esp),%ecx
- xorl %esi,%ebp
- xorl 60(%esp),%ecx
- xorl %eax,%ebp
- xorl 16(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,28(%esp)
- leal 1859775393(%ecx,%ebx,1),%ecx
- movl 32(%esp),%ebx
- addl %ebp,%ecx
- # 40_59 40
- movl %edi,%ebp
- xorl 40(%esp),%ebx
- xorl %esi,%ebp
- xorl (%esp),%ebx
- andl %edx,%ebp
- xorl 20(%esp),%ebx
- roll $1,%ebx
- addl %eax,%ebp
- rorl $2,%edx
- movl %ecx,%eax
- roll $5,%eax
- movl %ebx,32(%esp)
- leal 2400959708(%ebx,%ebp,1),%ebx
- movl %edi,%ebp
- addl %eax,%ebx
- andl %esi,%ebp
- movl 36(%esp),%eax
- addl %ebp,%ebx
- # 40_59 41
- movl %edx,%ebp
- xorl 44(%esp),%eax
- xorl %edi,%ebp
- xorl 4(%esp),%eax
- andl %ecx,%ebp
- xorl 24(%esp),%eax
- roll $1,%eax
- addl %esi,%ebp
- rorl $2,%ecx
- movl %ebx,%esi
- roll $5,%esi
- movl %eax,36(%esp)
- leal 2400959708(%eax,%ebp,1),%eax
- movl %edx,%ebp
- addl %esi,%eax
- andl %edi,%ebp
- movl 40(%esp),%esi
- addl %ebp,%eax
- # 40_59 42
- movl %ecx,%ebp
- xorl 48(%esp),%esi
- xorl %edx,%ebp
- xorl 8(%esp),%esi
- andl %ebx,%ebp
- xorl 28(%esp),%esi
- roll $1,%esi
- addl %edi,%ebp
- rorl $2,%ebx
- movl %eax,%edi
- roll $5,%edi
- movl %esi,40(%esp)
- leal 2400959708(%esi,%ebp,1),%esi
- movl %ecx,%ebp
- addl %edi,%esi
- andl %edx,%ebp
- movl 44(%esp),%edi
- addl %ebp,%esi
- # 40_59 43
- movl %ebx,%ebp
- xorl 52(%esp),%edi
- xorl %ecx,%ebp
- xorl 12(%esp),%edi
- andl %eax,%ebp
- xorl 32(%esp),%edi
- roll $1,%edi
- addl %edx,%ebp
- rorl $2,%eax
- movl %esi,%edx
- roll $5,%edx
- movl %edi,44(%esp)
- leal 2400959708(%edi,%ebp,1),%edi
- movl %ebx,%ebp
- addl %edx,%edi
- andl %ecx,%ebp
- movl 48(%esp),%edx
- addl %ebp,%edi
- # 40_59 44
- movl %eax,%ebp
- xorl 56(%esp),%edx
- xorl %ebx,%ebp
- xorl 16(%esp),%edx
- andl %esi,%ebp
- xorl 36(%esp),%edx
- roll $1,%edx
- addl %ecx,%ebp
- rorl $2,%esi
- movl %edi,%ecx
- roll $5,%ecx
- movl %edx,48(%esp)
- leal 2400959708(%edx,%ebp,1),%edx
- movl %eax,%ebp
- addl %ecx,%edx
- andl %ebx,%ebp
- movl 52(%esp),%ecx
- addl %ebp,%edx
- # 40_59 45
- movl %esi,%ebp
- xorl 60(%esp),%ecx
- xorl %eax,%ebp
- xorl 20(%esp),%ecx
- andl %edi,%ebp
- xorl 40(%esp),%ecx
- roll $1,%ecx
- addl %ebx,%ebp
- rorl $2,%edi
- movl %edx,%ebx
- roll $5,%ebx
- movl %ecx,52(%esp)
- leal 2400959708(%ecx,%ebp,1),%ecx
- movl %esi,%ebp
- addl %ebx,%ecx
- andl %eax,%ebp
- movl 56(%esp),%ebx
- addl %ebp,%ecx
- # 40_59 46
- movl %edi,%ebp
- xorl (%esp),%ebx
- xorl %esi,%ebp
- xorl 24(%esp),%ebx
- andl %edx,%ebp
- xorl 44(%esp),%ebx
- roll $1,%ebx
- addl %eax,%ebp
- rorl $2,%edx
- movl %ecx,%eax
- roll $5,%eax
- movl %ebx,56(%esp)
- leal 2400959708(%ebx,%ebp,1),%ebx
- movl %edi,%ebp
- addl %eax,%ebx
- andl %esi,%ebp
- movl 60(%esp),%eax
- addl %ebp,%ebx
- # 40_59 47
- movl %edx,%ebp
- xorl 4(%esp),%eax
- xorl %edi,%ebp
- xorl 28(%esp),%eax
- andl %ecx,%ebp
- xorl 48(%esp),%eax
- roll $1,%eax
- addl %esi,%ebp
- rorl $2,%ecx
- movl %ebx,%esi
- roll $5,%esi
- movl %eax,60(%esp)
- leal 2400959708(%eax,%ebp,1),%eax
- movl %edx,%ebp
- addl %esi,%eax
- andl %edi,%ebp
- movl (%esp),%esi
- addl %ebp,%eax
- # 40_59 48
- movl %ecx,%ebp
- xorl 8(%esp),%esi
- xorl %edx,%ebp
- xorl 32(%esp),%esi
- andl %ebx,%ebp
- xorl 52(%esp),%esi
- roll $1,%esi
- addl %edi,%ebp
- rorl $2,%ebx
- movl %eax,%edi
- roll $5,%edi
- movl %esi,(%esp)
- leal 2400959708(%esi,%ebp,1),%esi
- movl %ecx,%ebp
- addl %edi,%esi
- andl %edx,%ebp
- movl 4(%esp),%edi
- addl %ebp,%esi
- # 40_59 49
- movl %ebx,%ebp
- xorl 12(%esp),%edi
- xorl %ecx,%ebp
- xorl 36(%esp),%edi
- andl %eax,%ebp
- xorl 56(%esp),%edi
- roll $1,%edi
- addl %edx,%ebp
- rorl $2,%eax
- movl %esi,%edx
- roll $5,%edx
- movl %edi,4(%esp)
- leal 2400959708(%edi,%ebp,1),%edi
- movl %ebx,%ebp
- addl %edx,%edi
- andl %ecx,%ebp
- movl 8(%esp),%edx
- addl %ebp,%edi
- # 40_59 50
- movl %eax,%ebp
- xorl 16(%esp),%edx
- xorl %ebx,%ebp
- xorl 40(%esp),%edx
- andl %esi,%ebp
- xorl 60(%esp),%edx
- roll $1,%edx
- addl %ecx,%ebp
- rorl $2,%esi
- movl %edi,%ecx
- roll $5,%ecx
- movl %edx,8(%esp)
- leal 2400959708(%edx,%ebp,1),%edx
- movl %eax,%ebp
- addl %ecx,%edx
- andl %ebx,%ebp
- movl 12(%esp),%ecx
- addl %ebp,%edx
- # 40_59 51
- movl %esi,%ebp
- xorl 20(%esp),%ecx
- xorl %eax,%ebp
- xorl 44(%esp),%ecx
- andl %edi,%ebp
- xorl (%esp),%ecx
- roll $1,%ecx
- addl %ebx,%ebp
- rorl $2,%edi
- movl %edx,%ebx
- roll $5,%ebx
- movl %ecx,12(%esp)
- leal 2400959708(%ecx,%ebp,1),%ecx
- movl %esi,%ebp
- addl %ebx,%ecx
- andl %eax,%ebp
- movl 16(%esp),%ebx
- addl %ebp,%ecx
- # 40_59 52
- movl %edi,%ebp
- xorl 24(%esp),%ebx
- xorl %esi,%ebp
- xorl 48(%esp),%ebx
- andl %edx,%ebp
- xorl 4(%esp),%ebx
- roll $1,%ebx
- addl %eax,%ebp
- rorl $2,%edx
- movl %ecx,%eax
- roll $5,%eax
- movl %ebx,16(%esp)
- leal 2400959708(%ebx,%ebp,1),%ebx
- movl %edi,%ebp
- addl %eax,%ebx
- andl %esi,%ebp
- movl 20(%esp),%eax
- addl %ebp,%ebx
- # 40_59 53
- movl %edx,%ebp
- xorl 28(%esp),%eax
- xorl %edi,%ebp
- xorl 52(%esp),%eax
- andl %ecx,%ebp
- xorl 8(%esp),%eax
- roll $1,%eax
- addl %esi,%ebp
- rorl $2,%ecx
- movl %ebx,%esi
- roll $5,%esi
- movl %eax,20(%esp)
- leal 2400959708(%eax,%ebp,1),%eax
- movl %edx,%ebp
- addl %esi,%eax
- andl %edi,%ebp
- movl 24(%esp),%esi
- addl %ebp,%eax
- # 40_59 54
- movl %ecx,%ebp
- xorl 32(%esp),%esi
- xorl %edx,%ebp
- xorl 56(%esp),%esi
- andl %ebx,%ebp
- xorl 12(%esp),%esi
- roll $1,%esi
- addl %edi,%ebp
- rorl $2,%ebx
- movl %eax,%edi
- roll $5,%edi
- movl %esi,24(%esp)
- leal 2400959708(%esi,%ebp,1),%esi
- movl %ecx,%ebp
- addl %edi,%esi
- andl %edx,%ebp
- movl 28(%esp),%edi
- addl %ebp,%esi
- # 40_59 55
- movl %ebx,%ebp
- xorl 36(%esp),%edi
- xorl %ecx,%ebp
- xorl 60(%esp),%edi
- andl %eax,%ebp
- xorl 16(%esp),%edi
- roll $1,%edi
- addl %edx,%ebp
- rorl $2,%eax
- movl %esi,%edx
- roll $5,%edx
- movl %edi,28(%esp)
- leal 2400959708(%edi,%ebp,1),%edi
- movl %ebx,%ebp
- addl %edx,%edi
- andl %ecx,%ebp
- movl 32(%esp),%edx
- addl %ebp,%edi
- # 40_59 56
- movl %eax,%ebp
- xorl 40(%esp),%edx
- xorl %ebx,%ebp
- xorl (%esp),%edx
- andl %esi,%ebp
- xorl 20(%esp),%edx
- roll $1,%edx
- addl %ecx,%ebp
- rorl $2,%esi
- movl %edi,%ecx
- roll $5,%ecx
- movl %edx,32(%esp)
- leal 2400959708(%edx,%ebp,1),%edx
- movl %eax,%ebp
- addl %ecx,%edx
- andl %ebx,%ebp
- movl 36(%esp),%ecx
- addl %ebp,%edx
- # 40_59 57
- movl %esi,%ebp
- xorl 44(%esp),%ecx
- xorl %eax,%ebp
- xorl 4(%esp),%ecx
- andl %edi,%ebp
- xorl 24(%esp),%ecx
- roll $1,%ecx
- addl %ebx,%ebp
- rorl $2,%edi
- movl %edx,%ebx
- roll $5,%ebx
- movl %ecx,36(%esp)
- leal 2400959708(%ecx,%ebp,1),%ecx
- movl %esi,%ebp
- addl %ebx,%ecx
- andl %eax,%ebp
- movl 40(%esp),%ebx
- addl %ebp,%ecx
- # 40_59 58
- movl %edi,%ebp
- xorl 48(%esp),%ebx
- xorl %esi,%ebp
- xorl 8(%esp),%ebx
- andl %edx,%ebp
- xorl 28(%esp),%ebx
- roll $1,%ebx
- addl %eax,%ebp
- rorl $2,%edx
- movl %ecx,%eax
- roll $5,%eax
- movl %ebx,40(%esp)
- leal 2400959708(%ebx,%ebp,1),%ebx
- movl %edi,%ebp
- addl %eax,%ebx
- andl %esi,%ebp
- movl 44(%esp),%eax
- addl %ebp,%ebx
- # 40_59 59
- movl %edx,%ebp
- xorl 52(%esp),%eax
- xorl %edi,%ebp
- xorl 12(%esp),%eax
- andl %ecx,%ebp
- xorl 32(%esp),%eax
- roll $1,%eax
- addl %esi,%ebp
- rorl $2,%ecx
- movl %ebx,%esi
- roll $5,%esi
- movl %eax,44(%esp)
- leal 2400959708(%eax,%ebp,1),%eax
- movl %edx,%ebp
- addl %esi,%eax
- andl %edi,%ebp
- movl 48(%esp),%esi
- addl %ebp,%eax
- # 20_39 60
- movl %ebx,%ebp
- xorl 56(%esp),%esi
- xorl %ecx,%ebp
- xorl 16(%esp),%esi
- xorl %edx,%ebp
- xorl 36(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,48(%esp)
- leal 3395469782(%esi,%edi,1),%esi
- movl 52(%esp),%edi
- addl %ebp,%esi
- # 20_39 61
- movl %eax,%ebp
- xorl 60(%esp),%edi
- xorl %ebx,%ebp
- xorl 20(%esp),%edi
- xorl %ecx,%ebp
- xorl 40(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,52(%esp)
- leal 3395469782(%edi,%edx,1),%edi
- movl 56(%esp),%edx
- addl %ebp,%edi
- # 20_39 62
- movl %esi,%ebp
- xorl (%esp),%edx
- xorl %eax,%ebp
- xorl 24(%esp),%edx
- xorl %ebx,%ebp
- xorl 44(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,56(%esp)
- leal 3395469782(%edx,%ecx,1),%edx
- movl 60(%esp),%ecx
- addl %ebp,%edx
- # 20_39 63
- movl %edi,%ebp
- xorl 4(%esp),%ecx
- xorl %esi,%ebp
- xorl 28(%esp),%ecx
- xorl %eax,%ebp
- xorl 48(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,60(%esp)
- leal 3395469782(%ecx,%ebx,1),%ecx
- movl (%esp),%ebx
- addl %ebp,%ecx
- # 20_39 64
- movl %edx,%ebp
- xorl 8(%esp),%ebx
- xorl %edi,%ebp
- xorl 32(%esp),%ebx
- xorl %esi,%ebp
- xorl 52(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,(%esp)
- leal 3395469782(%ebx,%eax,1),%ebx
- movl 4(%esp),%eax
- addl %ebp,%ebx
- # 20_39 65
- movl %ecx,%ebp
- xorl 12(%esp),%eax
- xorl %edx,%ebp
- xorl 36(%esp),%eax
- xorl %edi,%ebp
- xorl 56(%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- movl %eax,4(%esp)
- leal 3395469782(%eax,%esi,1),%eax
- movl 8(%esp),%esi
- addl %ebp,%eax
- # 20_39 66
- movl %ebx,%ebp
- xorl 16(%esp),%esi
- xorl %ecx,%ebp
- xorl 40(%esp),%esi
- xorl %edx,%ebp
- xorl 60(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,8(%esp)
- leal 3395469782(%esi,%edi,1),%esi
- movl 12(%esp),%edi
- addl %ebp,%esi
- # 20_39 67
- movl %eax,%ebp
- xorl 20(%esp),%edi
- xorl %ebx,%ebp
- xorl 44(%esp),%edi
- xorl %ecx,%ebp
- xorl (%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,12(%esp)
- leal 3395469782(%edi,%edx,1),%edi
- movl 16(%esp),%edx
- addl %ebp,%edi
- # 20_39 68
- movl %esi,%ebp
- xorl 24(%esp),%edx
- xorl %eax,%ebp
- xorl 48(%esp),%edx
- xorl %ebx,%ebp
- xorl 4(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,16(%esp)
- leal 3395469782(%edx,%ecx,1),%edx
- movl 20(%esp),%ecx
- addl %ebp,%edx
- # 20_39 69
- movl %edi,%ebp
- xorl 28(%esp),%ecx
- xorl %esi,%ebp
- xorl 52(%esp),%ecx
- xorl %eax,%ebp
- xorl 8(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,20(%esp)
- leal 3395469782(%ecx,%ebx,1),%ecx
- movl 24(%esp),%ebx
- addl %ebp,%ecx
- # 20_39 70
- movl %edx,%ebp
- xorl 32(%esp),%ebx
- xorl %edi,%ebp
- xorl 56(%esp),%ebx
- xorl %esi,%ebp
- xorl 12(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,24(%esp)
- leal 3395469782(%ebx,%eax,1),%ebx
- movl 28(%esp),%eax
- addl %ebp,%ebx
- # 20_39 71
- movl %ecx,%ebp
- xorl 36(%esp),%eax
- xorl %edx,%ebp
- xorl 60(%esp),%eax
- xorl %edi,%ebp
- xorl 16(%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- movl %eax,28(%esp)
- leal 3395469782(%eax,%esi,1),%eax
- movl 32(%esp),%esi
- addl %ebp,%eax
- # 20_39 72
- movl %ebx,%ebp
- xorl 40(%esp),%esi
- xorl %ecx,%ebp
- xorl (%esp),%esi
- xorl %edx,%ebp
- xorl 20(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- movl %esi,32(%esp)
- leal 3395469782(%esi,%edi,1),%esi
- movl 36(%esp),%edi
- addl %ebp,%esi
- # 20_39 73
- movl %eax,%ebp
- xorl 44(%esp),%edi
- xorl %ebx,%ebp
- xorl 4(%esp),%edi
- xorl %ecx,%ebp
- xorl 24(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- movl %edi,36(%esp)
- leal 3395469782(%edi,%edx,1),%edi
- movl 40(%esp),%edx
- addl %ebp,%edi
- # 20_39 74
- movl %esi,%ebp
- xorl 48(%esp),%edx
- xorl %eax,%ebp
- xorl 8(%esp),%edx
- xorl %ebx,%ebp
- xorl 28(%esp),%edx
- roll $1,%edx
- addl %ebp,%ecx
- rorl $2,%esi
- movl %edi,%ebp
- roll $5,%ebp
- movl %edx,40(%esp)
- leal 3395469782(%edx,%ecx,1),%edx
- movl 44(%esp),%ecx
- addl %ebp,%edx
- # 20_39 75
- movl %edi,%ebp
- xorl 52(%esp),%ecx
- xorl %esi,%ebp
- xorl 12(%esp),%ecx
- xorl %eax,%ebp
- xorl 32(%esp),%ecx
- roll $1,%ecx
- addl %ebp,%ebx
- rorl $2,%edi
- movl %edx,%ebp
- roll $5,%ebp
- movl %ecx,44(%esp)
- leal 3395469782(%ecx,%ebx,1),%ecx
- movl 48(%esp),%ebx
- addl %ebp,%ecx
- # 20_39 76
- movl %edx,%ebp
- xorl 56(%esp),%ebx
- xorl %edi,%ebp
- xorl 16(%esp),%ebx
- xorl %esi,%ebp
- xorl 36(%esp),%ebx
- roll $1,%ebx
- addl %ebp,%eax
- rorl $2,%edx
- movl %ecx,%ebp
- roll $5,%ebp
- movl %ebx,48(%esp)
- leal 3395469782(%ebx,%eax,1),%ebx
- movl 52(%esp),%eax
- addl %ebp,%ebx
- # 20_39 77
- movl %ecx,%ebp
- xorl 60(%esp),%eax
- xorl %edx,%ebp
- xorl 20(%esp),%eax
- xorl %edi,%ebp
- xorl 40(%esp),%eax
- roll $1,%eax
- addl %ebp,%esi
- rorl $2,%ecx
- movl %ebx,%ebp
- roll $5,%ebp
- leal 3395469782(%eax,%esi,1),%eax
- movl 56(%esp),%esi
- addl %ebp,%eax
- # 20_39 78
- movl %ebx,%ebp
- xorl (%esp),%esi
- xorl %ecx,%ebp
- xorl 24(%esp),%esi
- xorl %edx,%ebp
- xorl 44(%esp),%esi
- roll $1,%esi
- addl %ebp,%edi
- rorl $2,%ebx
- movl %eax,%ebp
- roll $5,%ebp
- leal 3395469782(%esi,%edi,1),%esi
- movl 60(%esp),%edi
- addl %ebp,%esi
- # 20_39 79
- movl %eax,%ebp
- xorl 4(%esp),%edi
- xorl %ebx,%ebp
- xorl 28(%esp),%edi
- xorl %ecx,%ebp
- xorl 48(%esp),%edi
- roll $1,%edi
- addl %ebp,%edx
- rorl $2,%eax
- movl %esi,%ebp
- roll $5,%ebp
- leal 3395469782(%edi,%edx,1),%edi
- addl %ebp,%edi
- movl 96(%esp),%ebp
- movl 100(%esp),%edx
- addl (%ebp),%edi
- addl 4(%ebp),%esi
- addl 8(%ebp),%eax
- addl 12(%ebp),%ebx
- addl 16(%ebp),%ecx
- movl %edi,(%ebp)
- addl $64,%edx
- movl %esi,4(%ebp)
- cmpl 104(%esp),%edx
- movl %eax,8(%ebp)
- movl %ecx,%edi
- movl %ebx,12(%ebp)
- movl %edx,%esi
- movl %ecx,16(%ebp)
- jb L002loop
- addl $76,%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.private_extern __sha1_block_data_order_ssse3
-.align 4
-__sha1_block_data_order_ssse3:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call L003pic_point
-L003pic_point:
- popl %ebp
- leal LK_XX_XX-L003pic_point(%ebp),%ebp
-Lssse3_shortcut:
- movdqa (%ebp),%xmm7
- movdqa 16(%ebp),%xmm0
- movdqa 32(%ebp),%xmm1
- movdqa 48(%ebp),%xmm2
- movdqa 64(%ebp),%xmm6
- movl 20(%esp),%edi
- movl 24(%esp),%ebp
- movl 28(%esp),%edx
- movl %esp,%esi
- subl $208,%esp
- andl $-64,%esp
- movdqa %xmm0,112(%esp)
- movdqa %xmm1,128(%esp)
- movdqa %xmm2,144(%esp)
- shll $6,%edx
- movdqa %xmm7,160(%esp)
- addl %ebp,%edx
- movdqa %xmm6,176(%esp)
- addl $64,%ebp
- movl %edi,192(%esp)
- movl %ebp,196(%esp)
- movl %edx,200(%esp)
- movl %esi,204(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- movl 16(%edi),%edi
- movl %ebx,%esi
- movdqu -64(%ebp),%xmm0
- movdqu -48(%ebp),%xmm1
- movdqu -32(%ebp),%xmm2
- movdqu -16(%ebp),%xmm3
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
- movdqa %xmm7,96(%esp)
-.byte 102,15,56,0,222
- paddd %xmm7,%xmm0
- paddd %xmm7,%xmm1
- paddd %xmm7,%xmm2
- movdqa %xmm0,(%esp)
- psubd %xmm7,%xmm0
- movdqa %xmm1,16(%esp)
- psubd %xmm7,%xmm1
- movdqa %xmm2,32(%esp)
- movl %ecx,%ebp
- psubd %xmm7,%xmm2
- xorl %edx,%ebp
- pshufd $238,%xmm0,%xmm4
- andl %ebp,%esi
- jmp L004loop
-.align 4,0x90
-L004loop:
- rorl $2,%ebx
- xorl %edx,%esi
- movl %eax,%ebp
- punpcklqdq %xmm1,%xmm4
- movdqa %xmm3,%xmm6
- addl (%esp),%edi
- xorl %ecx,%ebx
- paddd %xmm3,%xmm7
- movdqa %xmm0,64(%esp)
- roll $5,%eax
- addl %esi,%edi
- psrldq $4,%xmm6
- andl %ebx,%ebp
- xorl %ecx,%ebx
- pxor %xmm0,%xmm4
- addl %eax,%edi
- rorl $7,%eax
- pxor %xmm2,%xmm6
- xorl %ecx,%ebp
- movl %edi,%esi
- addl 4(%esp),%edx
- pxor %xmm6,%xmm4
- xorl %ebx,%eax
- roll $5,%edi
- movdqa %xmm7,48(%esp)
- addl %ebp,%edx
- andl %eax,%esi
- movdqa %xmm4,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- rorl $7,%edi
- movdqa %xmm4,%xmm6
- xorl %ebx,%esi
- pslldq $12,%xmm0
- paddd %xmm4,%xmm4
- movl %edx,%ebp
- addl 8(%esp),%ecx
- psrld $31,%xmm6
- xorl %eax,%edi
- roll $5,%edx
- movdqa %xmm0,%xmm7
- addl %esi,%ecx
- andl %edi,%ebp
- xorl %eax,%edi
- psrld $30,%xmm0
- addl %edx,%ecx
- rorl $7,%edx
- por %xmm6,%xmm4
- xorl %eax,%ebp
- movl %ecx,%esi
- addl 12(%esp),%ebx
- pslld $2,%xmm7
- xorl %edi,%edx
- roll $5,%ecx
- pxor %xmm0,%xmm4
- movdqa 96(%esp),%xmm0
- addl %ebp,%ebx
- andl %edx,%esi
- pxor %xmm7,%xmm4
- pshufd $238,%xmm1,%xmm5
- xorl %edi,%edx
- addl %ecx,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- movl %ebx,%ebp
- punpcklqdq %xmm2,%xmm5
- movdqa %xmm4,%xmm7
- addl 16(%esp),%eax
- xorl %edx,%ecx
- paddd %xmm4,%xmm0
- movdqa %xmm1,80(%esp)
- roll $5,%ebx
- addl %esi,%eax
- psrldq $4,%xmm7
- andl %ecx,%ebp
- xorl %edx,%ecx
- pxor %xmm1,%xmm5
- addl %ebx,%eax
- rorl $7,%ebx
- pxor %xmm3,%xmm7
- xorl %edx,%ebp
- movl %eax,%esi
- addl 20(%esp),%edi
- pxor %xmm7,%xmm5
- xorl %ecx,%ebx
- roll $5,%eax
- movdqa %xmm0,(%esp)
- addl %ebp,%edi
- andl %ebx,%esi
- movdqa %xmm5,%xmm1
- xorl %ecx,%ebx
- addl %eax,%edi
- rorl $7,%eax
- movdqa %xmm5,%xmm7
- xorl %ecx,%esi
- pslldq $12,%xmm1
- paddd %xmm5,%xmm5
- movl %edi,%ebp
- addl 24(%esp),%edx
- psrld $31,%xmm7
- xorl %ebx,%eax
- roll $5,%edi
- movdqa %xmm1,%xmm0
- addl %esi,%edx
- andl %eax,%ebp
- xorl %ebx,%eax
- psrld $30,%xmm1
- addl %edi,%edx
- rorl $7,%edi
- por %xmm7,%xmm5
- xorl %ebx,%ebp
- movl %edx,%esi
- addl 28(%esp),%ecx
- pslld $2,%xmm0
- xorl %eax,%edi
- roll $5,%edx
- pxor %xmm1,%xmm5
- movdqa 112(%esp),%xmm1
- addl %ebp,%ecx
- andl %edi,%esi
- pxor %xmm0,%xmm5
- pshufd $238,%xmm2,%xmm6
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- punpcklqdq %xmm3,%xmm6
- movdqa %xmm5,%xmm0
- addl 32(%esp),%ebx
- xorl %edi,%edx
- paddd %xmm5,%xmm1
- movdqa %xmm2,96(%esp)
- roll $5,%ecx
- addl %esi,%ebx
- psrldq $4,%xmm0
- andl %edx,%ebp
- xorl %edi,%edx
- pxor %xmm2,%xmm6
- addl %ecx,%ebx
- rorl $7,%ecx
- pxor %xmm4,%xmm0
- xorl %edi,%ebp
- movl %ebx,%esi
- addl 36(%esp),%eax
- pxor %xmm0,%xmm6
- xorl %edx,%ecx
- roll $5,%ebx
- movdqa %xmm1,16(%esp)
- addl %ebp,%eax
- andl %ecx,%esi
- movdqa %xmm6,%xmm2
- xorl %edx,%ecx
- addl %ebx,%eax
- rorl $7,%ebx
- movdqa %xmm6,%xmm0
- xorl %edx,%esi
- pslldq $12,%xmm2
- paddd %xmm6,%xmm6
- movl %eax,%ebp
- addl 40(%esp),%edi
- psrld $31,%xmm0
- xorl %ecx,%ebx
- roll $5,%eax
- movdqa %xmm2,%xmm1
- addl %esi,%edi
- andl %ebx,%ebp
- xorl %ecx,%ebx
- psrld $30,%xmm2
- addl %eax,%edi
- rorl $7,%eax
- por %xmm0,%xmm6
- xorl %ecx,%ebp
- movdqa 64(%esp),%xmm0
- movl %edi,%esi
- addl 44(%esp),%edx
- pslld $2,%xmm1
- xorl %ebx,%eax
- roll $5,%edi
- pxor %xmm2,%xmm6
- movdqa 112(%esp),%xmm2
- addl %ebp,%edx
- andl %eax,%esi
- pxor %xmm1,%xmm6
- pshufd $238,%xmm3,%xmm7
- xorl %ebx,%eax
- addl %edi,%edx
- rorl $7,%edi
- xorl %ebx,%esi
- movl %edx,%ebp
- punpcklqdq %xmm4,%xmm7
- movdqa %xmm6,%xmm1
- addl 48(%esp),%ecx
- xorl %eax,%edi
- paddd %xmm6,%xmm2
- movdqa %xmm3,64(%esp)
- roll $5,%edx
- addl %esi,%ecx
- psrldq $4,%xmm1
- andl %edi,%ebp
- xorl %eax,%edi
- pxor %xmm3,%xmm7
- addl %edx,%ecx
- rorl $7,%edx
- pxor %xmm5,%xmm1
- xorl %eax,%ebp
- movl %ecx,%esi
- addl 52(%esp),%ebx
- pxor %xmm1,%xmm7
- xorl %edi,%edx
- roll $5,%ecx
- movdqa %xmm2,32(%esp)
- addl %ebp,%ebx
- andl %edx,%esi
- movdqa %xmm7,%xmm3
- xorl %edi,%edx
- addl %ecx,%ebx
- rorl $7,%ecx
- movdqa %xmm7,%xmm1
- xorl %edi,%esi
- pslldq $12,%xmm3
- paddd %xmm7,%xmm7
- movl %ebx,%ebp
- addl 56(%esp),%eax
- psrld $31,%xmm1
- xorl %edx,%ecx
- roll $5,%ebx
- movdqa %xmm3,%xmm2
- addl %esi,%eax
- andl %ecx,%ebp
- xorl %edx,%ecx
- psrld $30,%xmm3
- addl %ebx,%eax
- rorl $7,%ebx
- por %xmm1,%xmm7
- xorl %edx,%ebp
- movdqa 80(%esp),%xmm1
- movl %eax,%esi
- addl 60(%esp),%edi
- pslld $2,%xmm2
- xorl %ecx,%ebx
- roll $5,%eax
- pxor %xmm3,%xmm7
- movdqa 112(%esp),%xmm3
- addl %ebp,%edi
- andl %ebx,%esi
- pxor %xmm2,%xmm7
- pshufd $238,%xmm6,%xmm2
- xorl %ecx,%ebx
- addl %eax,%edi
- rorl $7,%eax
- pxor %xmm4,%xmm0
- punpcklqdq %xmm7,%xmm2
- xorl %ecx,%esi
- movl %edi,%ebp
- addl (%esp),%edx
- pxor %xmm1,%xmm0
- movdqa %xmm4,80(%esp)
- xorl %ebx,%eax
- roll $5,%edi
- movdqa %xmm3,%xmm4
- addl %esi,%edx
- paddd %xmm7,%xmm3
- andl %eax,%ebp
- pxor %xmm2,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- rorl $7,%edi
- xorl %ebx,%ebp
- movdqa %xmm0,%xmm2
- movdqa %xmm3,48(%esp)
- movl %edx,%esi
- addl 4(%esp),%ecx
- xorl %eax,%edi
- roll $5,%edx
- pslld $2,%xmm0
- addl %ebp,%ecx
- andl %edi,%esi
- psrld $30,%xmm2
- xorl %eax,%edi
- addl %edx,%ecx
- rorl $7,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- addl 8(%esp),%ebx
- xorl %edi,%edx
- roll $5,%ecx
- por %xmm2,%xmm0
- addl %esi,%ebx
- andl %edx,%ebp
- movdqa 96(%esp),%xmm2
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 12(%esp),%eax
- xorl %edi,%ebp
- movl %ebx,%esi
- pshufd $238,%xmm7,%xmm3
- roll $5,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- addl %ebx,%eax
- addl 16(%esp),%edi
- pxor %xmm5,%xmm1
- punpcklqdq %xmm0,%xmm3
- xorl %ecx,%esi
- movl %eax,%ebp
- roll $5,%eax
- pxor %xmm2,%xmm1
- movdqa %xmm5,96(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- movdqa %xmm4,%xmm5
- rorl $7,%ebx
- paddd %xmm0,%xmm4
- addl %eax,%edi
- pxor %xmm3,%xmm1
- addl 20(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- roll $5,%edi
- movdqa %xmm1,%xmm3
- movdqa %xmm4,(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm1
- addl 24(%esp),%ecx
- xorl %eax,%esi
- psrld $30,%xmm3
- movl %edx,%ebp
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- rorl $7,%edi
- addl %edx,%ecx
- por %xmm3,%xmm1
- addl 28(%esp),%ebx
- xorl %edi,%ebp
- movdqa 64(%esp),%xmm3
- movl %ecx,%esi
- roll $5,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- rorl $7,%edx
- pshufd $238,%xmm0,%xmm4
- addl %ecx,%ebx
- addl 32(%esp),%eax
- pxor %xmm6,%xmm2
- punpcklqdq %xmm1,%xmm4
- xorl %edx,%esi
- movl %ebx,%ebp
- roll $5,%ebx
- pxor %xmm3,%xmm2
- movdqa %xmm6,64(%esp)
- addl %esi,%eax
- xorl %edx,%ebp
- movdqa 128(%esp),%xmm6
- rorl $7,%ecx
- paddd %xmm1,%xmm5
- addl %ebx,%eax
- pxor %xmm4,%xmm2
- addl 36(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- roll $5,%eax
- movdqa %xmm2,%xmm4
- movdqa %xmm5,16(%esp)
- addl %ebp,%edi
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%edi
- pslld $2,%xmm2
- addl 40(%esp),%edx
- xorl %ebx,%esi
- psrld $30,%xmm4
- movl %edi,%ebp
- roll $5,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- rorl $7,%eax
- addl %edi,%edx
- por %xmm4,%xmm2
- addl 44(%esp),%ecx
- xorl %eax,%ebp
- movdqa 80(%esp),%xmm4
- movl %edx,%esi
- roll $5,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- rorl $7,%edi
- pshufd $238,%xmm1,%xmm5
- addl %edx,%ecx
- addl 48(%esp),%ebx
- pxor %xmm7,%xmm3
- punpcklqdq %xmm2,%xmm5
- xorl %edi,%esi
- movl %ecx,%ebp
- roll $5,%ecx
- pxor %xmm4,%xmm3
- movdqa %xmm7,80(%esp)
- addl %esi,%ebx
- xorl %edi,%ebp
- movdqa %xmm6,%xmm7
- rorl $7,%edx
- paddd %xmm2,%xmm6
- addl %ecx,%ebx
- pxor %xmm5,%xmm3
- addl 52(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- roll $5,%ebx
- movdqa %xmm3,%xmm5
- movdqa %xmm6,32(%esp)
- addl %ebp,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- addl %ebx,%eax
- pslld $2,%xmm3
- addl 56(%esp),%edi
- xorl %ecx,%esi
- psrld $30,%xmm5
- movl %eax,%ebp
- roll $5,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- rorl $7,%ebx
- addl %eax,%edi
- por %xmm5,%xmm3
- addl 60(%esp),%edx
- xorl %ebx,%ebp
- movdqa 96(%esp),%xmm5
- movl %edi,%esi
- roll $5,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- pshufd $238,%xmm2,%xmm6
- addl %edi,%edx
- addl (%esp),%ecx
- pxor %xmm0,%xmm4
- punpcklqdq %xmm3,%xmm6
- xorl %eax,%esi
- movl %edx,%ebp
- roll $5,%edx
- pxor %xmm5,%xmm4
- movdqa %xmm0,96(%esp)
- addl %esi,%ecx
- xorl %eax,%ebp
- movdqa %xmm7,%xmm0
- rorl $7,%edi
- paddd %xmm3,%xmm7
- addl %edx,%ecx
- pxor %xmm6,%xmm4
- addl 4(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- roll $5,%ecx
- movdqa %xmm4,%xmm6
- movdqa %xmm7,48(%esp)
- addl %ebp,%ebx
- xorl %edi,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- pslld $2,%xmm4
- addl 8(%esp),%eax
- xorl %edx,%esi
- psrld $30,%xmm6
- movl %ebx,%ebp
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- rorl $7,%ecx
- addl %ebx,%eax
- por %xmm6,%xmm4
- addl 12(%esp),%edi
- xorl %ecx,%ebp
- movdqa 64(%esp),%xmm6
- movl %eax,%esi
- roll $5,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- rorl $7,%ebx
- pshufd $238,%xmm3,%xmm7
- addl %eax,%edi
- addl 16(%esp),%edx
- pxor %xmm1,%xmm5
- punpcklqdq %xmm4,%xmm7
- xorl %ebx,%esi
- movl %edi,%ebp
- roll $5,%edi
- pxor %xmm6,%xmm5
- movdqa %xmm1,64(%esp)
- addl %esi,%edx
- xorl %ebx,%ebp
- movdqa %xmm0,%xmm1
- rorl $7,%eax
- paddd %xmm4,%xmm0
- addl %edi,%edx
- pxor %xmm7,%xmm5
- addl 20(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- roll $5,%edx
- movdqa %xmm5,%xmm7
- movdqa %xmm0,(%esp)
- addl %ebp,%ecx
- xorl %eax,%esi
- rorl $7,%edi
- addl %edx,%ecx
- pslld $2,%xmm5
- addl 24(%esp),%ebx
- xorl %edi,%esi
- psrld $30,%xmm7
- movl %ecx,%ebp
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- rorl $7,%edx
- addl %ecx,%ebx
- por %xmm7,%xmm5
- addl 28(%esp),%eax
- movdqa 80(%esp),%xmm7
- rorl $7,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- roll $5,%ebx
- pshufd $238,%xmm4,%xmm0
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 32(%esp),%edi
- pxor %xmm2,%xmm6
- punpcklqdq %xmm5,%xmm0
- andl %ecx,%esi
- xorl %edx,%ecx
- rorl $7,%ebx
- pxor %xmm7,%xmm6
- movdqa %xmm2,80(%esp)
- movl %eax,%ebp
- xorl %ecx,%esi
- roll $5,%eax
- movdqa %xmm1,%xmm2
- addl %esi,%edi
- paddd %xmm5,%xmm1
- xorl %ebx,%ebp
- pxor %xmm0,%xmm6
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 36(%esp),%edx
- andl %ebx,%ebp
- movdqa %xmm6,%xmm0
- movdqa %xmm1,16(%esp)
- xorl %ecx,%ebx
- rorl $7,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- roll $5,%edi
- pslld $2,%xmm6
- addl %ebp,%edx
- xorl %eax,%esi
- psrld $30,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- addl 40(%esp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- rorl $7,%edi
- por %xmm0,%xmm6
- movl %edx,%ebp
- xorl %eax,%esi
- movdqa 96(%esp),%xmm0
- roll $5,%edx
- addl %esi,%ecx
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- pshufd $238,%xmm5,%xmm1
- addl 44(%esp),%ebx
- andl %edi,%ebp
- xorl %eax,%edi
- rorl $7,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- roll $5,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 48(%esp),%eax
- pxor %xmm3,%xmm7
- punpcklqdq %xmm6,%xmm1
- andl %edx,%esi
- xorl %edi,%edx
- rorl $7,%ecx
- pxor %xmm0,%xmm7
- movdqa %xmm3,96(%esp)
- movl %ebx,%ebp
- xorl %edx,%esi
- roll $5,%ebx
- movdqa 144(%esp),%xmm3
- addl %esi,%eax
- paddd %xmm6,%xmm2
- xorl %ecx,%ebp
- pxor %xmm1,%xmm7
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%esp),%edi
- andl %ecx,%ebp
- movdqa %xmm7,%xmm1
- movdqa %xmm2,32(%esp)
- xorl %edx,%ecx
- rorl $7,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- roll $5,%eax
- pslld $2,%xmm7
- addl %ebp,%edi
- xorl %ebx,%esi
- psrld $30,%xmm1
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 56(%esp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- rorl $7,%eax
- por %xmm1,%xmm7
- movl %edi,%ebp
- xorl %ebx,%esi
- movdqa 64(%esp),%xmm1
- roll $5,%edi
- addl %esi,%edx
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- pshufd $238,%xmm6,%xmm2
- addl 60(%esp),%ecx
- andl %eax,%ebp
- xorl %ebx,%eax
- rorl $7,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- roll $5,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- addl (%esp),%ebx
- pxor %xmm4,%xmm0
- punpcklqdq %xmm7,%xmm2
- andl %edi,%esi
- xorl %eax,%edi
- rorl $7,%edx
- pxor %xmm1,%xmm0
- movdqa %xmm4,64(%esp)
- movl %ecx,%ebp
- xorl %edi,%esi
- roll $5,%ecx
- movdqa %xmm3,%xmm4
- addl %esi,%ebx
- paddd %xmm7,%xmm3
- xorl %edx,%ebp
- pxor %xmm2,%xmm0
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 4(%esp),%eax
- andl %edx,%ebp
- movdqa %xmm0,%xmm2
- movdqa %xmm3,48(%esp)
- xorl %edi,%edx
- rorl $7,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- roll $5,%ebx
- pslld $2,%xmm0
- addl %ebp,%eax
- xorl %ecx,%esi
- psrld $30,%xmm2
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%esp),%edi
- andl %ecx,%esi
- xorl %edx,%ecx
- rorl $7,%ebx
- por %xmm2,%xmm0
- movl %eax,%ebp
- xorl %ecx,%esi
- movdqa 80(%esp),%xmm2
- roll $5,%eax
- addl %esi,%edi
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- pshufd $238,%xmm7,%xmm3
- addl 12(%esp),%edx
- andl %ebx,%ebp
- xorl %ecx,%ebx
- rorl $7,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- roll $5,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- addl 16(%esp),%ecx
- pxor %xmm5,%xmm1
- punpcklqdq %xmm0,%xmm3
- andl %eax,%esi
- xorl %ebx,%eax
- rorl $7,%edi
- pxor %xmm2,%xmm1
- movdqa %xmm5,80(%esp)
- movl %edx,%ebp
- xorl %eax,%esi
- roll $5,%edx
- movdqa %xmm4,%xmm5
- addl %esi,%ecx
- paddd %xmm0,%xmm4
- xorl %edi,%ebp
- pxor %xmm3,%xmm1
- xorl %eax,%edi
- addl %edx,%ecx
- addl 20(%esp),%ebx
- andl %edi,%ebp
- movdqa %xmm1,%xmm3
- movdqa %xmm4,(%esp)
- xorl %eax,%edi
- rorl $7,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- roll $5,%ecx
- pslld $2,%xmm1
- addl %ebp,%ebx
- xorl %edx,%esi
- psrld $30,%xmm3
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 24(%esp),%eax
- andl %edx,%esi
- xorl %edi,%edx
- rorl $7,%ecx
- por %xmm3,%xmm1
- movl %ebx,%ebp
- xorl %edx,%esi
- movdqa 96(%esp),%xmm3
- roll $5,%ebx
- addl %esi,%eax
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- pshufd $238,%xmm0,%xmm4
- addl 28(%esp),%edi
- andl %ecx,%ebp
- xorl %edx,%ecx
- rorl $7,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- roll $5,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 32(%esp),%edx
- pxor %xmm6,%xmm2
- punpcklqdq %xmm1,%xmm4
- andl %ebx,%esi
- xorl %ecx,%ebx
- rorl $7,%eax
- pxor %xmm3,%xmm2
- movdqa %xmm6,96(%esp)
- movl %edi,%ebp
- xorl %ebx,%esi
- roll $5,%edi
- movdqa %xmm5,%xmm6
- addl %esi,%edx
- paddd %xmm1,%xmm5
- xorl %eax,%ebp
- pxor %xmm4,%xmm2
- xorl %ebx,%eax
- addl %edi,%edx
- addl 36(%esp),%ecx
- andl %eax,%ebp
- movdqa %xmm2,%xmm4
- movdqa %xmm5,16(%esp)
- xorl %ebx,%eax
- rorl $7,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- roll $5,%edx
- pslld $2,%xmm2
- addl %ebp,%ecx
- xorl %edi,%esi
- psrld $30,%xmm4
- xorl %eax,%edi
- addl %edx,%ecx
- addl 40(%esp),%ebx
- andl %edi,%esi
- xorl %eax,%edi
- rorl $7,%edx
- por %xmm4,%xmm2
- movl %ecx,%ebp
- xorl %edi,%esi
- movdqa 64(%esp),%xmm4
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- pshufd $238,%xmm1,%xmm5
- addl 44(%esp),%eax
- andl %edx,%ebp
- xorl %edi,%edx
- rorl $7,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- roll $5,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- addl 48(%esp),%edi
- pxor %xmm7,%xmm3
- punpcklqdq %xmm2,%xmm5
- xorl %ecx,%esi
- movl %eax,%ebp
- roll $5,%eax
- pxor %xmm4,%xmm3
- movdqa %xmm7,64(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- movdqa %xmm6,%xmm7
- rorl $7,%ebx
- paddd %xmm2,%xmm6
- addl %eax,%edi
- pxor %xmm5,%xmm3
- addl 52(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- roll $5,%edi
- movdqa %xmm3,%xmm5
- movdqa %xmm6,32(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %edi,%edx
- pslld $2,%xmm3
- addl 56(%esp),%ecx
- xorl %eax,%esi
- psrld $30,%xmm5
- movl %edx,%ebp
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- rorl $7,%edi
- addl %edx,%ecx
- por %xmm5,%xmm3
- addl 60(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- roll $5,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- addl (%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- rorl $7,%ecx
- paddd %xmm3,%xmm7
- addl %ebx,%eax
- addl 4(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- movdqa %xmm7,48(%esp)
- roll $5,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%edi
- addl 8(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- roll $5,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- rorl $7,%eax
- addl %edi,%edx
- addl 12(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- roll $5,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- rorl $7,%edi
- addl %edx,%ecx
- movl 196(%esp),%ebp
- cmpl 200(%esp),%ebp
- je L005done
- movdqa 160(%esp),%xmm7
- movdqa 176(%esp),%xmm6
- movdqu (%ebp),%xmm0
- movdqu 16(%ebp),%xmm1
- movdqu 32(%ebp),%xmm2
- movdqu 48(%ebp),%xmm3
- addl $64,%ebp
-.byte 102,15,56,0,198
- movl %ebp,196(%esp)
- movdqa %xmm7,96(%esp)
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- rorl $7,%edx
-.byte 102,15,56,0,206
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- paddd %xmm7,%xmm0
- roll $5,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- movdqa %xmm0,(%esp)
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- psubd %xmm7,%xmm0
- roll $5,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- rorl $7,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- roll $5,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- rorl $7,%edi
-.byte 102,15,56,0,214
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- paddd %xmm7,%xmm1
- roll $5,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- rorl $7,%edx
- movdqa %xmm1,16(%esp)
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- psubd %xmm7,%xmm1
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- rorl $7,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- roll $5,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- roll $5,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- rorl $7,%eax
-.byte 102,15,56,0,222
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- paddd %xmm7,%xmm2
- roll $5,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- rorl $7,%edi
- movdqa %xmm2,32(%esp)
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- psubd %xmm7,%xmm2
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- rorl $7,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- roll $5,%ebx
- addl %ebp,%eax
- rorl $7,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %ecx,%ebx
- movl %edx,12(%ebp)
- xorl %edx,%ebx
- movl %edi,16(%ebp)
- movl %esi,%ebp
- pshufd $238,%xmm0,%xmm4
- andl %ebx,%esi
- movl %ebp,%ebx
- jmp L004loop
-.align 4,0x90
-L005done:
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- rorl $7,%edx
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- roll $5,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- roll $5,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- rorl $7,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- roll $5,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- rorl $7,%edi
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- roll $5,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- rorl $7,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- roll $5,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- roll $5,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- rorl $7,%eax
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- roll $5,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- rorl $7,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- roll $5,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- rorl $7,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- roll $5,%ebx
- addl %ebp,%eax
- rorl $7,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- movl 204(%esp),%esp
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.private_extern __sha1_block_data_order_avx
-.align 4
-__sha1_block_data_order_avx:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call L006pic_point
-L006pic_point:
- popl %ebp
- leal LK_XX_XX-L006pic_point(%ebp),%ebp
-Lavx_shortcut:
- vzeroall
- vmovdqa (%ebp),%xmm7
- vmovdqa 16(%ebp),%xmm0
- vmovdqa 32(%ebp),%xmm1
- vmovdqa 48(%ebp),%xmm2
- vmovdqa 64(%ebp),%xmm6
- movl 20(%esp),%edi
- movl 24(%esp),%ebp
- movl 28(%esp),%edx
- movl %esp,%esi
- subl $208,%esp
- andl $-64,%esp
- vmovdqa %xmm0,112(%esp)
- vmovdqa %xmm1,128(%esp)
- vmovdqa %xmm2,144(%esp)
- shll $6,%edx
- vmovdqa %xmm7,160(%esp)
- addl %ebp,%edx
- vmovdqa %xmm6,176(%esp)
- addl $64,%ebp
- movl %edi,192(%esp)
- movl %ebp,196(%esp)
- movl %edx,200(%esp)
- movl %esi,204(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- movl 16(%edi),%edi
- movl %ebx,%esi
- vmovdqu -64(%ebp),%xmm0
- vmovdqu -48(%ebp),%xmm1
- vmovdqu -32(%ebp),%xmm2
- vmovdqu -16(%ebp),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vmovdqa %xmm7,96(%esp)
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm7,%xmm0,%xmm4
- vpaddd %xmm7,%xmm1,%xmm5
- vpaddd %xmm7,%xmm2,%xmm6
- vmovdqa %xmm4,(%esp)
- movl %ecx,%ebp
- vmovdqa %xmm5,16(%esp)
- xorl %edx,%ebp
- vmovdqa %xmm6,32(%esp)
- andl %ebp,%esi
- jmp L007loop
-.align 4,0x90
-L007loop:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%ebp
- addl (%esp),%edi
- vpaddd %xmm3,%xmm7,%xmm7
- vmovdqa %xmm0,64(%esp)
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%edi
- vpxor %xmm2,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vmovdqa %xmm7,48(%esp)
- movl %edi,%esi
- addl 4(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- addl %ebp,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm6
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm0
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%ebp
- addl 8(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm0,%xmm7
- vpor %xmm6,%xmm4,%xmm4
- addl %esi,%ecx
- andl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- vpslld $2,%xmm0,%xmm0
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vpxor %xmm7,%xmm4,%xmm4
- movl %ecx,%esi
- addl 12(%esp),%ebx
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpxor %xmm0,%xmm4,%xmm4
- addl %ebp,%ebx
- andl %edx,%esi
- vmovdqa 96(%esp),%xmm0
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%ebp
- addl 16(%esp),%eax
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqa %xmm1,80(%esp)
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vmovdqa %xmm0,(%esp)
- movl %eax,%esi
- addl 20(%esp),%edi
- vpxor %xmm7,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %ebp,%edi
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm7
- xorl %ecx,%ebx
- addl %eax,%edi
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm1
- vpaddd %xmm5,%xmm5,%xmm5
- movl %edi,%ebp
- addl 24(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm0
- vpor %xmm7,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpxor %xmm0,%xmm5,%xmm5
- movl %edx,%esi
- addl 28(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpxor %xmm1,%xmm5,%xmm5
- addl %ebp,%ecx
- andl %edi,%esi
- vmovdqa 112(%esp),%xmm1
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%ebp
- addl 32(%esp),%ebx
- vpaddd %xmm5,%xmm1,%xmm1
- vmovdqa %xmm2,96(%esp)
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm0
- addl %esi,%ebx
- andl %edx,%ebp
- vpxor %xmm2,%xmm6,%xmm6
- xorl %edi,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%ecx,%ecx
- xorl %edi,%ebp
- vmovdqa %xmm1,16(%esp)
- movl %ebx,%esi
- addl 36(%esp),%eax
- vpxor %xmm0,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm0
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm2
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%ebp
- addl 40(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm1
- vpor %xmm0,%xmm6,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- vmovdqa 64(%esp),%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vpxor %xmm1,%xmm6,%xmm6
- movl %edi,%esi
- addl 44(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpxor %xmm2,%xmm6,%xmm6
- addl %ebp,%edx
- andl %eax,%esi
- vmovdqa 112(%esp),%xmm2
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%ebp
- addl 48(%esp),%ecx
- vpaddd %xmm6,%xmm2,%xmm2
- vmovdqa %xmm3,64(%esp)
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm1
- addl %esi,%ecx
- andl %edi,%ebp
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%edi
- addl %edx,%ecx
- vpxor %xmm5,%xmm1,%xmm1
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vmovdqa %xmm2,32(%esp)
- movl %ecx,%esi
- addl 52(%esp),%ebx
- vpxor %xmm1,%xmm7,%xmm7
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm1
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpslldq $12,%xmm7,%xmm3
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%ebp
- addl 56(%esp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm2
- vpor %xmm1,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- vmovdqa 80(%esp),%xmm1
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vpxor %xmm2,%xmm7,%xmm7
- movl %eax,%esi
- addl 60(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpxor %xmm3,%xmm7,%xmm7
- addl %ebp,%edi
- andl %ebx,%esi
- vmovdqa 112(%esp),%xmm3
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %edi,%ebp
- addl (%esp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,80(%esp)
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- addl %esi,%edx
- andl %eax,%ebp
- vpxor %xmm2,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- movl %edx,%esi
- addl 4(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %ebp,%ecx
- andl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- addl 8(%esp),%ebx
- vpor %xmm2,%xmm0,%xmm0
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vmovdqa 96(%esp),%xmm2
- addl %esi,%ebx
- andl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 12(%esp),%eax
- xorl %edi,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,96(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm3,%xmm1,%xmm1
- addl 20(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm3,%xmm1,%xmm1
- addl 28(%esp),%ebx
- xorl %edi,%ebp
- vmovdqa 64(%esp),%xmm3
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,64(%esp)
- addl %esi,%eax
- xorl %edx,%ebp
- vmovdqa 128(%esp),%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm4,%xmm2,%xmm2
- addl 36(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- addl 40(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpor %xmm4,%xmm2,%xmm2
- addl 44(%esp),%ecx
- xorl %eax,%ebp
- vmovdqa 80(%esp),%xmm4
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,80(%esp)
- addl %esi,%ebx
- xorl %edi,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%edx
- xorl %ebx,%ebp
- vmovdqa 96(%esp),%xmm5
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm6
- vpxor %xmm0,%xmm4,%xmm4
- addl (%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- vmovdqa %xmm0,96(%esp)
- addl %esi,%ecx
- xorl %eax,%ebp
- vmovdqa %xmm7,%xmm0
- vpaddd %xmm3,%xmm7,%xmm7
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpxor %xmm6,%xmm4,%xmm4
- addl 4(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm6
- vmovdqa %xmm7,48(%esp)
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm6,%xmm4,%xmm4
- addl 12(%esp),%edi
- xorl %ecx,%ebp
- vmovdqa 64(%esp),%xmm6
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpxor %xmm6,%xmm5,%xmm5
- vmovdqa %xmm1,64(%esp)
- addl %esi,%edx
- xorl %ebx,%ebp
- vmovdqa %xmm0,%xmm1
- vpaddd %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpxor %xmm7,%xmm5,%xmm5
- addl 20(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm7
- vmovdqa %xmm0,(%esp)
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm7,%xmm5,%xmm5
- addl 28(%esp),%eax
- vmovdqa 80(%esp),%xmm7
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%esp),%edi
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- vmovdqa %xmm2,80(%esp)
- movl %eax,%ebp
- xorl %ecx,%esi
- vmovdqa %xmm1,%xmm2
- vpaddd %xmm5,%xmm1,%xmm1
- shldl $5,%eax,%eax
- addl %esi,%edi
- vpxor %xmm0,%xmm6,%xmm6
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 36(%esp),%edx
- vpsrld $30,%xmm6,%xmm0
- vmovdqa %xmm1,16(%esp)
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- addl 40(%esp),%ecx
- andl %eax,%esi
- vpor %xmm0,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vmovdqa 96(%esp),%xmm0
- movl %edx,%ebp
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 44(%esp),%ebx
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm1
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%esp),%eax
- andl %edx,%esi
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- vmovdqa %xmm3,96(%esp)
- movl %ebx,%ebp
- xorl %edx,%esi
- vmovdqa 144(%esp),%xmm3
- vpaddd %xmm6,%xmm2,%xmm2
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%esp),%edi
- vpsrld $30,%xmm7,%xmm1
- vmovdqa %xmm2,32(%esp)
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 56(%esp),%edx
- andl %ebx,%esi
- vpor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vmovdqa 64(%esp),%xmm1
- movl %edi,%ebp
- xorl %ebx,%esi
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 60(%esp),%ecx
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- addl (%esp),%ebx
- andl %edi,%esi
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,64(%esp)
- movl %ecx,%ebp
- xorl %edi,%esi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm2,%xmm0,%xmm0
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 4(%esp),%eax
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%esp),%edi
- andl %ecx,%esi
- vpor %xmm2,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vmovdqa 80(%esp),%xmm2
- movl %eax,%ebp
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 12(%esp),%edx
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,80(%esp)
- movl %edx,%ebp
- xorl %eax,%esi
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm3,%xmm1,%xmm1
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 20(%esp),%ebx
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 24(%esp),%eax
- andl %edx,%esi
- vpor %xmm3,%xmm1,%xmm1
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vmovdqa 96(%esp),%xmm3
- movl %ebx,%ebp
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%esp),%edi
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,96(%esp)
- movl %edi,%ebp
- xorl %ebx,%esi
- vmovdqa %xmm5,%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shldl $5,%edi,%edi
- addl %esi,%edx
- vpxor %xmm4,%xmm2,%xmm2
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 36(%esp),%ecx
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- addl 40(%esp),%ebx
- andl %edi,%esi
- vpor %xmm4,%xmm2,%xmm2
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vmovdqa 64(%esp),%xmm4
- movl %ecx,%ebp
- xorl %edi,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 44(%esp),%eax
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,64(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl (%esp),%eax
- vpaddd %xmm3,%xmm7,%xmm7
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm7,48(%esp)
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 8(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 12(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- movl 196(%esp),%ebp
- cmpl 200(%esp),%ebp
- je L008done
- vmovdqa 160(%esp),%xmm7
- vmovdqa 176(%esp),%xmm6
- vmovdqu (%ebp),%xmm0
- vmovdqu 16(%ebp),%xmm1
- vmovdqu 32(%ebp),%xmm2
- vmovdqu 48(%ebp),%xmm3
- addl $64,%ebp
- vpshufb %xmm6,%xmm0,%xmm0
- movl %ebp,196(%esp)
- vmovdqa %xmm7,96(%esp)
- addl 16(%esp),%ebx
- xorl %edi,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpaddd %xmm7,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,(%esp)
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpaddd %xmm7,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vmovdqa %xmm5,16(%esp)
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpaddd %xmm7,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vmovdqa %xmm6,32(%esp)
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,%ebx
- movl %ecx,8(%ebp)
- xorl %edx,%ebx
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- movl %esi,%ebp
- andl %ebx,%esi
- movl %ebp,%ebx
- jmp L007loop
-.align 4,0x90
-L008done:
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroall
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- movl 204(%esp),%esp
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 6,0x90
-LK_XX_XX:
-.long 1518500249,1518500249,1518500249,1518500249
-.long 1859775393,1859775393,1859775393,1859775393
-.long 2400959708,2400959708,2400959708,2400959708
-.long 3395469782,3395469782,3395469782,3395469782
-.long 66051,67438087,134810123,202182159
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
-.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
-.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
-.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha256-586-apple.S b/apple-x86/crypto/fipsmodule/sha256-586-apple.S
deleted file mode 100644
index d43510a..0000000
--- a/apple-x86/crypto/fipsmodule/sha256-586-apple.S
+++ /dev/null
@@ -1,5567 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _sha256_block_data_order
-.private_extern _sha256_block_data_order
-.align 4
-_sha256_block_data_order:
-L_sha256_block_data_order_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl %esp,%ebx
- call L000pic_point
-L000pic_point:
- popl %ebp
- leal L001K256-L000pic_point(%ebp),%ebp
- subl $16,%esp
- andl $-64,%esp
- shll $6,%eax
- addl %edi,%eax
- movl %esi,(%esp)
- movl %edi,4(%esp)
- movl %eax,8(%esp)
- movl %ebx,12(%esp)
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K256(%ebp),%edx
- movl (%edx),%ecx
- movl 4(%edx),%ebx
- testl $1048576,%ecx
- jnz L002loop
- movl 8(%edx),%edx
- testl $16777216,%ecx
- jz L003no_xmm
- andl $1073741824,%ecx
- andl $268435968,%ebx
- orl %ebx,%ecx
- andl $1342177280,%ecx
- cmpl $1342177280,%ecx
- je L004AVX
- testl $512,%ebx
- jnz L005SSSE3
-L003no_xmm:
- subl %edi,%eax
- cmpl $256,%eax
- jae L006unrolled
- jmp L002loop
-.align 4,0x90
-L002loop:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- bswap %eax
- movl 12(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- bswap %eax
- movl 28(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %eax
- movl 44(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- bswap %eax
- movl 60(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- addl $64,%edi
- leal -36(%esp),%esp
- movl %edi,104(%esp)
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,8(%esp)
- xorl %ecx,%ebx
- movl %ecx,12(%esp)
- movl %edi,16(%esp)
- movl %ebx,(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- movl %edi,32(%esp)
-.align 4,0x90
-L00700_15:
- movl %edx,%ecx
- movl 24(%esp),%esi
- rorl $14,%ecx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl 96(%esp),%ebx
- rorl $5,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- rorl $6,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- rorl $9,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- rorl $11,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3248222580,%esi
- jne L00700_15
- movl 156(%esp),%ecx
- jmp L00816_63
-.align 4,0x90
-L00816_63:
- movl %ecx,%ebx
- movl 104(%esp),%esi
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 160(%esp),%ebx
- shrl $10,%edi
- addl 124(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 24(%esp),%esi
- rorl $14,%ecx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl %ebx,96(%esp)
- rorl $5,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- rorl $6,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- rorl $9,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- rorl $11,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- movl 156(%esp),%ecx
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3329325298,%esi
- jne L00816_63
- movl 356(%esp),%esi
- movl 8(%esp),%ebx
- movl 16(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl 24(%esp),%eax
- movl 28(%esp),%ebx
- movl 32(%esp),%ecx
- movl 360(%esp),%edi
- addl 16(%esi),%edx
- addl 20(%esi),%eax
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
- movl %edx,16(%esi)
- movl %eax,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- leal 356(%esp),%esp
- subl $256,%ebp
- cmpl 8(%esp),%edi
- jb L002loop
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 6,0x90
-L001K256:
-.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
-.long 66051,67438087,134810123,202182159
-.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte 62,0
-.align 4,0x90
-L006unrolled:
- leal -96(%esp),%esp
- movl (%esi),%eax
- movl 4(%esi),%ebp
- movl 8(%esi),%ecx
- movl 12(%esi),%ebx
- movl %ebp,4(%esp)
- xorl %ecx,%ebp
- movl %ecx,8(%esp)
- movl %ebx,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %ebx,20(%esp)
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- jmp L009grand_loop
-.align 4,0x90
-L009grand_loop:
- movl (%edi),%ebx
- movl 4(%edi),%ecx
- bswap %ebx
- movl 8(%edi),%esi
- bswap %ecx
- movl %ebx,32(%esp)
- bswap %esi
- movl %ecx,36(%esp)
- movl %esi,40(%esp)
- movl 12(%edi),%ebx
- movl 16(%edi),%ecx
- bswap %ebx
- movl 20(%edi),%esi
- bswap %ecx
- movl %ebx,44(%esp)
- bswap %esi
- movl %ecx,48(%esp)
- movl %esi,52(%esp)
- movl 24(%edi),%ebx
- movl 28(%edi),%ecx
- bswap %ebx
- movl 32(%edi),%esi
- bswap %ecx
- movl %ebx,56(%esp)
- bswap %esi
- movl %ecx,60(%esp)
- movl %esi,64(%esp)
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %ebx
- movl 44(%edi),%esi
- bswap %ecx
- movl %ebx,68(%esp)
- bswap %esi
- movl %ecx,72(%esp)
- movl %esi,76(%esp)
- movl 48(%edi),%ebx
- movl 52(%edi),%ecx
- bswap %ebx
- movl 56(%edi),%esi
- bswap %ecx
- movl %ebx,80(%esp)
- bswap %esi
- movl %ecx,84(%esp)
- movl %esi,88(%esp)
- movl 60(%edi),%ebx
- addl $64,%edi
- bswap %ebx
- movl %edi,100(%esp)
- movl %ebx,92(%esp)
- movl %edx,%ecx
- movl 20(%esp),%esi
- rorl $14,%edx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl 32(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1116352408(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 16(%esp),%ecx
- rorl $14,%edx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl 36(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1899447441(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 12(%esp),%esi
- rorl $14,%edx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl 40(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3049323471(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 8(%esp),%ecx
- rorl $14,%edx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl 44(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3921009573(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 4(%esp),%esi
- rorl $14,%edx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl 48(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 961987163(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl (%esp),%ecx
- rorl $14,%edx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl 52(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1508970993(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 28(%esp),%esi
- rorl $14,%edx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl 56(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2453635748(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 24(%esp),%ecx
- rorl $14,%edx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl 60(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2870763221(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 20(%esp),%esi
- rorl $14,%edx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl 64(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3624381080(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 16(%esp),%ecx
- rorl $14,%edx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl 68(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 310598401(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 12(%esp),%esi
- rorl $14,%edx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl 72(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 607225278(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 8(%esp),%ecx
- rorl $14,%edx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl 76(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1426881987(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 4(%esp),%esi
- rorl $14,%edx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl 80(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1925078388(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl (%esp),%ecx
- rorl $14,%edx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl 84(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2162078206(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl %edx,%ecx
- movl 28(%esp),%esi
- rorl $14,%edx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl 88(%esp),%ebx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2614888103(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl %edx,%esi
- movl 24(%esp),%ecx
- rorl $14,%edx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl 92(%esp),%ebx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3248222580(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 36(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 88(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 32(%esp),%ebx
- shrl $10,%edi
- addl 68(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,32(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3835390401(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 40(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 92(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 36(%esp),%ebx
- shrl $10,%edi
- addl 72(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,36(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 4022224774(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 44(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 32(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 40(%esp),%ebx
- shrl $10,%edi
- addl 76(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,40(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 264347078(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 48(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 36(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 44(%esp),%ebx
- shrl $10,%edi
- addl 80(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,44(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 604807628(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 52(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 40(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 48(%esp),%ebx
- shrl $10,%edi
- addl 84(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,48(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 770255983(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 56(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 44(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 52(%esp),%ebx
- shrl $10,%edi
- addl 88(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,52(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1249150122(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 60(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 48(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 56(%esp),%ebx
- shrl $10,%edi
- addl 92(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl %ebx,56(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1555081692(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 64(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 52(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 60(%esp),%ebx
- shrl $10,%edi
- addl 32(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl %ebx,60(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1996064986(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 68(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 56(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 64(%esp),%ebx
- shrl $10,%edi
- addl 36(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,64(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2554220882(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 72(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 60(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 68(%esp),%ebx
- shrl $10,%edi
- addl 40(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,68(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2821834349(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 76(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 64(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 72(%esp),%ebx
- shrl $10,%edi
- addl 44(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,72(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2952996808(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 80(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 68(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 76(%esp),%ebx
- shrl $10,%edi
- addl 48(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,76(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3210313671(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 84(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 72(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 80(%esp),%ebx
- shrl $10,%edi
- addl 52(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,80(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3336571891(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 88(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 76(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 84(%esp),%ebx
- shrl $10,%edi
- addl 56(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,84(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3584528711(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 92(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 80(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 88(%esp),%ebx
- shrl $10,%edi
- addl 60(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl %ebx,88(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 113926993(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 32(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 84(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 92(%esp),%ebx
- shrl $10,%edi
- addl 64(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl %ebx,92(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 338241895(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 36(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 88(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 32(%esp),%ebx
- shrl $10,%edi
- addl 68(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,32(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 666307205(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 40(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 92(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 36(%esp),%ebx
- shrl $10,%edi
- addl 72(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,36(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 773529912(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 44(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 32(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 40(%esp),%ebx
- shrl $10,%edi
- addl 76(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,40(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1294757372(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 48(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 36(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 44(%esp),%ebx
- shrl $10,%edi
- addl 80(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,44(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1396182291(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 52(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 40(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 48(%esp),%ebx
- shrl $10,%edi
- addl 84(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,48(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1695183700(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 56(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 44(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 52(%esp),%ebx
- shrl $10,%edi
- addl 88(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,52(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1986661051(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 60(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 48(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 56(%esp),%ebx
- shrl $10,%edi
- addl 92(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl %ebx,56(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2177026350(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 64(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 52(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 60(%esp),%ebx
- shrl $10,%edi
- addl 32(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl %ebx,60(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2456956037(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 68(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 56(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 64(%esp),%ebx
- shrl $10,%edi
- addl 36(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,64(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2730485921(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 72(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 60(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 68(%esp),%ebx
- shrl $10,%edi
- addl 40(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,68(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2820302411(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 76(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 64(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 72(%esp),%ebx
- shrl $10,%edi
- addl 44(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,72(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3259730800(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 80(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 68(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 76(%esp),%ebx
- shrl $10,%edi
- addl 48(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,76(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3345764771(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 84(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 72(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 80(%esp),%ebx
- shrl $10,%edi
- addl 52(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,80(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3516065817(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 88(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 76(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 84(%esp),%ebx
- shrl $10,%edi
- addl 56(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,84(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3600352804(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 92(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 80(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 88(%esp),%ebx
- shrl $10,%edi
- addl 60(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl %ebx,88(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 4094571909(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 32(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 84(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 92(%esp),%ebx
- shrl $10,%edi
- addl 64(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl %ebx,92(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 275423344(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 36(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 88(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 32(%esp),%ebx
- shrl $10,%edi
- addl 68(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,32(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 430227734(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 40(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 92(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 36(%esp),%ebx
- shrl $10,%edi
- addl 72(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,36(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 506948616(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 44(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 32(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 40(%esp),%ebx
- shrl $10,%edi
- addl 76(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,40(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 659060556(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 48(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 36(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 44(%esp),%ebx
- shrl $10,%edi
- addl 80(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,44(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 883997877(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 52(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 40(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 48(%esp),%ebx
- shrl $10,%edi
- addl 84(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,48(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 958139571(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 56(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 44(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 52(%esp),%ebx
- shrl $10,%edi
- addl 88(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,52(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1322822218(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 60(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 48(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 56(%esp),%ebx
- shrl $10,%edi
- addl 92(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- movl %ebx,56(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1537002063(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 64(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 52(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 60(%esp),%ebx
- shrl $10,%edi
- addl 32(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- movl %ebx,60(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 1747873779(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 68(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 56(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 64(%esp),%ebx
- shrl $10,%edi
- addl 36(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 20(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 24(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,64(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- addl 28(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 4(%esp),%edi
- xorl %eax,%ecx
- movl %eax,(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 1955562222(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 72(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 12(%esp),%edx
- addl %ecx,%ebp
- movl 60(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 68(%esp),%ebx
- shrl $10,%edi
- addl 40(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 16(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 20(%esp),%edi
- xorl %esi,%edx
- movl %ebx,68(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,12(%esp)
- xorl %esi,%edx
- addl 24(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl (%esp),%edi
- xorl %ebp,%esi
- movl %ebp,28(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2024104815(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 76(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %esi,%eax
- movl 64(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 72(%esp),%ebx
- shrl $10,%edi
- addl 44(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 12(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 16(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,72(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- addl 20(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 28(%esp),%edi
- xorl %eax,%ecx
- movl %eax,24(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2227730452(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 80(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 4(%esp),%edx
- addl %ecx,%ebp
- movl 68(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 76(%esp),%ebx
- shrl $10,%edi
- addl 48(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 8(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 12(%esp),%edi
- xorl %esi,%edx
- movl %ebx,76(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,4(%esp)
- xorl %esi,%edx
- addl 16(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 24(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,20(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2361852424(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 84(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl (%esp),%edx
- addl %esi,%eax
- movl 72(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 80(%esp),%ebx
- shrl $10,%edi
- addl 52(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 4(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl 8(%esp),%edi
- xorl %ecx,%edx
- movl %ebx,80(%esp)
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- addl 12(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 20(%esp),%edi
- xorl %eax,%ecx
- movl %eax,16(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 2428436474(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 88(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 28(%esp),%edx
- addl %ecx,%ebp
- movl 76(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 84(%esp),%ebx
- shrl $10,%edi
- addl 56(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl (%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 4(%esp),%edi
- xorl %esi,%edx
- movl %ebx,84(%esp)
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,28(%esp)
- xorl %esi,%edx
- addl 8(%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 16(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,12(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 2756734187(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- movl 92(%esp),%ecx
- rorl $2,%esi
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %esi,%eax
- movl 80(%esp),%esi
- movl %ecx,%ebx
- rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 88(%esp),%ebx
- shrl $10,%edi
- addl 60(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 28(%esp),%esi
- rorl $14,%edx
- addl %edi,%ebx
- movl (%esp),%edi
- xorl %ecx,%edx
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- addl 4(%esp),%ebx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%ebx
- rorl $9,%ecx
- movl %eax,%esi
- movl 12(%esp),%edi
- xorl %eax,%ecx
- movl %eax,8(%esp)
- xorl %edi,%eax
- rorl $11,%ecx
- andl %eax,%ebp
- leal 3204031479(%ebx,%edx,1),%edx
- xorl %esi,%ecx
- xorl %edi,%ebp
- movl 32(%esp),%esi
- rorl $2,%ecx
- addl %edx,%ebp
- addl 20(%esp),%edx
- addl %ecx,%ebp
- movl 84(%esp),%ecx
- movl %esi,%ebx
- rorl $11,%esi
- movl %ecx,%edi
- rorl $2,%ecx
- xorl %ebx,%esi
- shrl $3,%ebx
- rorl $7,%esi
- xorl %edi,%ecx
- xorl %esi,%ebx
- rorl $17,%ecx
- addl 92(%esp),%ebx
- shrl $10,%edi
- addl 64(%esp),%ebx
- movl %edx,%esi
- xorl %ecx,%edi
- movl 24(%esp),%ecx
- rorl $14,%edx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %esi,%edx
- xorl %edi,%ecx
- rorl $5,%edx
- andl %esi,%ecx
- movl %esi,20(%esp)
- xorl %esi,%edx
- addl (%esp),%ebx
- xorl %ecx,%edi
- rorl $6,%edx
- movl %ebp,%esi
- addl %edi,%ebx
- rorl $9,%esi
- movl %ebp,%ecx
- movl 8(%esp),%edi
- xorl %ebp,%esi
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- rorl $11,%esi
- andl %ebp,%eax
- leal 3329325298(%ebx,%edx,1),%edx
- xorl %ecx,%esi
- xorl %edi,%eax
- rorl $2,%esi
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %esi,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebp
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebp
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebp,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebp,4(%esp)
- xorl %edi,%ebp
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ebx
- movl 28(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- cmpl 104(%esp),%edi
- jb L009grand_loop
- movl 108(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 5,0x90
-L005SSSE3:
- leal -96(%esp),%esp
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- movdqa 256(%ebp),%xmm7
- jmp L010grand_ssse3
-.align 4,0x90
-L010grand_ssse3:
- movdqu (%edi),%xmm0
- movdqu 16(%edi),%xmm1
- movdqu 32(%edi),%xmm2
- movdqu 48(%edi),%xmm3
- addl $64,%edi
-.byte 102,15,56,0,199
- movl %edi,100(%esp)
-.byte 102,15,56,0,207
- movdqa (%ebp),%xmm4
-.byte 102,15,56,0,215
- movdqa 16(%ebp),%xmm5
- paddd %xmm0,%xmm4
-.byte 102,15,56,0,223
- movdqa 32(%ebp),%xmm6
- paddd %xmm1,%xmm5
- movdqa 48(%ebp),%xmm7
- movdqa %xmm4,32(%esp)
- paddd %xmm2,%xmm6
- movdqa %xmm5,48(%esp)
- paddd %xmm3,%xmm7
- movdqa %xmm6,64(%esp)
- movdqa %xmm7,80(%esp)
- jmp L011ssse3_00_47
-.align 4,0x90
-L011ssse3_00_47:
- addl $64,%ebp
- movl %edx,%ecx
- movdqa %xmm1,%xmm4
- rorl $14,%edx
- movl 20(%esp),%esi
- movdqa %xmm3,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
-.byte 102,15,58,15,224,4
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
-.byte 102,15,58,15,250,4
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- movdqa %xmm4,%xmm5
- rorl $6,%edx
- movl %eax,%ecx
- movdqa %xmm4,%xmm6
- addl %edi,%edx
- movl 4(%esp),%edi
- psrld $3,%xmm4
- movl %eax,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm0
- movl %eax,(%esp)
- xorl %eax,%ecx
- psrld $7,%xmm6
- xorl %edi,%eax
- addl 28(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- pshufd $250,%xmm3,%xmm7
- xorl %esi,%ecx
- addl 32(%esp),%edx
- pslld $14,%xmm5
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- psrld $11,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm5,%xmm4
- movl 16(%esp),%esi
- xorl %ecx,%edx
- pslld $11,%xmm5
- movl 20(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- pxor %xmm6,%xmm4
- andl %ecx,%esi
- movl %ecx,12(%esp)
- movdqa %xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- pxor %xmm5,%xmm4
- movl %ebx,%ecx
- addl %edi,%edx
- psrld $10,%xmm7
- movl (%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm4,%xmm0
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- psrlq $17,%xmm6
- xorl %edi,%ebx
- addl 24(%esp),%edx
- rorl $11,%ecx
- pxor %xmm6,%xmm7
- andl %ebx,%eax
- xorl %esi,%ecx
- psrlq $2,%xmm6
- addl 36(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- pshufd $128,%xmm7,%xmm7
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- psrldq $8,%xmm7
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- paddd %xmm7,%xmm0
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,24(%esp)
- pshufd $80,%xmm0,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- movdqa %xmm7,%xmm6
- rorl $11,%ecx
- psrld $10,%xmm7
- andl %eax,%ebx
- psrlq $17,%xmm6
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- psrlq $2,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm6,%xmm7
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- pshufd $8,%xmm7,%xmm7
- xorl %edi,%esi
- rorl $5,%edx
- movdqa (%ebp),%xmm6
- andl %ecx,%esi
- movl %ecx,4(%esp)
- pslldq $8,%xmm7
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm0
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- paddd %xmm0,%xmm6
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movdqa %xmm6,32(%esp)
- movl %edx,%ecx
- movdqa %xmm2,%xmm4
- rorl $14,%edx
- movl 4(%esp),%esi
- movdqa %xmm0,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
-.byte 102,15,58,15,225,4
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
-.byte 102,15,58,15,251,4
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- movdqa %xmm4,%xmm5
- rorl $6,%edx
- movl %eax,%ecx
- movdqa %xmm4,%xmm6
- addl %edi,%edx
- movl 20(%esp),%edi
- psrld $3,%xmm4
- movl %eax,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm1
- movl %eax,16(%esp)
- xorl %eax,%ecx
- psrld $7,%xmm6
- xorl %edi,%eax
- addl 12(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- pshufd $250,%xmm0,%xmm7
- xorl %esi,%ecx
- addl 48(%esp),%edx
- pslld $14,%xmm5
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- psrld $11,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm5,%xmm4
- movl (%esp),%esi
- xorl %ecx,%edx
- pslld $11,%xmm5
- movl 4(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- pxor %xmm6,%xmm4
- andl %ecx,%esi
- movl %ecx,28(%esp)
- movdqa %xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- pxor %xmm5,%xmm4
- movl %ebx,%ecx
- addl %edi,%edx
- psrld $10,%xmm7
- movl 16(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm4,%xmm1
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- psrlq $17,%xmm6
- xorl %edi,%ebx
- addl 8(%esp),%edx
- rorl $11,%ecx
- pxor %xmm6,%xmm7
- andl %ebx,%eax
- xorl %esi,%ecx
- psrlq $2,%xmm6
- addl 52(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- pshufd $128,%xmm7,%xmm7
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- psrldq $8,%xmm7
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- paddd %xmm7,%xmm1
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,8(%esp)
- pshufd $80,%xmm1,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- movdqa %xmm7,%xmm6
- rorl $11,%ecx
- psrld $10,%xmm7
- andl %eax,%ebx
- psrlq $17,%xmm6
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- psrlq $2,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm6,%xmm7
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- pshufd $8,%xmm7,%xmm7
- xorl %edi,%esi
- rorl $5,%edx
- movdqa 16(%ebp),%xmm6
- andl %ecx,%esi
- movl %ecx,20(%esp)
- pslldq $8,%xmm7
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm1
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- paddd %xmm1,%xmm6
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movdqa %xmm6,48(%esp)
- movl %edx,%ecx
- movdqa %xmm3,%xmm4
- rorl $14,%edx
- movl 20(%esp),%esi
- movdqa %xmm1,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
-.byte 102,15,58,15,226,4
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
-.byte 102,15,58,15,248,4
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- movdqa %xmm4,%xmm5
- rorl $6,%edx
- movl %eax,%ecx
- movdqa %xmm4,%xmm6
- addl %edi,%edx
- movl 4(%esp),%edi
- psrld $3,%xmm4
- movl %eax,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm2
- movl %eax,(%esp)
- xorl %eax,%ecx
- psrld $7,%xmm6
- xorl %edi,%eax
- addl 28(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- pshufd $250,%xmm1,%xmm7
- xorl %esi,%ecx
- addl 64(%esp),%edx
- pslld $14,%xmm5
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- psrld $11,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm5,%xmm4
- movl 16(%esp),%esi
- xorl %ecx,%edx
- pslld $11,%xmm5
- movl 20(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- pxor %xmm6,%xmm4
- andl %ecx,%esi
- movl %ecx,12(%esp)
- movdqa %xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- pxor %xmm5,%xmm4
- movl %ebx,%ecx
- addl %edi,%edx
- psrld $10,%xmm7
- movl (%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm4,%xmm2
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- psrlq $17,%xmm6
- xorl %edi,%ebx
- addl 24(%esp),%edx
- rorl $11,%ecx
- pxor %xmm6,%xmm7
- andl %ebx,%eax
- xorl %esi,%ecx
- psrlq $2,%xmm6
- addl 68(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- pshufd $128,%xmm7,%xmm7
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- psrldq $8,%xmm7
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- paddd %xmm7,%xmm2
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,24(%esp)
- pshufd $80,%xmm2,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- movdqa %xmm7,%xmm6
- rorl $11,%ecx
- psrld $10,%xmm7
- andl %eax,%ebx
- psrlq $17,%xmm6
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- psrlq $2,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm6,%xmm7
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- pshufd $8,%xmm7,%xmm7
- xorl %edi,%esi
- rorl $5,%edx
- movdqa 32(%ebp),%xmm6
- andl %ecx,%esi
- movl %ecx,4(%esp)
- pslldq $8,%xmm7
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm2
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- paddd %xmm2,%xmm6
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movdqa %xmm6,64(%esp)
- movl %edx,%ecx
- movdqa %xmm0,%xmm4
- rorl $14,%edx
- movl 4(%esp),%esi
- movdqa %xmm2,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
-.byte 102,15,58,15,227,4
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
-.byte 102,15,58,15,249,4
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- movdqa %xmm4,%xmm5
- rorl $6,%edx
- movl %eax,%ecx
- movdqa %xmm4,%xmm6
- addl %edi,%edx
- movl 20(%esp),%edi
- psrld $3,%xmm4
- movl %eax,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm3
- movl %eax,16(%esp)
- xorl %eax,%ecx
- psrld $7,%xmm6
- xorl %edi,%eax
- addl 12(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- pshufd $250,%xmm2,%xmm7
- xorl %esi,%ecx
- addl 80(%esp),%edx
- pslld $14,%xmm5
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- psrld $11,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm5,%xmm4
- movl (%esp),%esi
- xorl %ecx,%edx
- pslld $11,%xmm5
- movl 4(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- pxor %xmm6,%xmm4
- andl %ecx,%esi
- movl %ecx,28(%esp)
- movdqa %xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- pxor %xmm5,%xmm4
- movl %ebx,%ecx
- addl %edi,%edx
- psrld $10,%xmm7
- movl 16(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm4,%xmm3
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- psrlq $17,%xmm6
- xorl %edi,%ebx
- addl 8(%esp),%edx
- rorl $11,%ecx
- pxor %xmm6,%xmm7
- andl %ebx,%eax
- xorl %esi,%ecx
- psrlq $2,%xmm6
- addl 84(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- pshufd $128,%xmm7,%xmm7
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- psrldq $8,%xmm7
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- paddd %xmm7,%xmm3
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,8(%esp)
- pshufd $80,%xmm3,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- movdqa %xmm7,%xmm6
- rorl $11,%ecx
- psrld $10,%xmm7
- andl %eax,%ebx
- psrlq $17,%xmm6
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- pxor %xmm6,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- psrlq $2,%xmm6
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- pxor %xmm6,%xmm7
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- pshufd $8,%xmm7,%xmm7
- xorl %edi,%esi
- rorl $5,%edx
- movdqa 48(%ebp),%xmm6
- andl %ecx,%esi
- movl %ecx,20(%esp)
- pslldq $8,%xmm7
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- paddd %xmm7,%xmm3
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- paddd %xmm3,%xmm6
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne L011ssse3_00_47
- movl %edx,%ecx
- rorl $14,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 36(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 52(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 68(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 84(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- rorl $14,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- rorl $9,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- rorl $11,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- rorl $2,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- rorl $14,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- rorl $5,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- rorl $6,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- rorl $9,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- rorl $11,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- rorl $2,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- movdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb L010grand_ssse3
- movl 108(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 5,0x90
-L004AVX:
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp L012grand_avx
-.align 5,0x90
-L012grand_avx:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp L013avx_00_47
-.align 4,0x90
-L013avx_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm0,%xmm0
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm0,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm1,%xmm1
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm1,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm2,%xmm2
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm2,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm3,%xmm3
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm3,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne L013avx_00_47
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb L012grand_avx
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha512-586-apple.S b/apple-x86/crypto/fipsmodule/sha512-586-apple.S
deleted file mode 100644
index cfdeac1..0000000
--- a/apple-x86/crypto/fipsmodule/sha512-586-apple.S
+++ /dev/null
@@ -1,2837 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _sha512_block_data_order
-.private_extern _sha512_block_data_order
-.align 4
-_sha512_block_data_order:
-L_sha512_block_data_order_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl %esp,%ebx
- call L000pic_point
-L000pic_point:
- popl %ebp
- leal L001K512-L000pic_point(%ebp),%ebp
- subl $16,%esp
- andl $-64,%esp
- shll $7,%eax
- addl %edi,%eax
- movl %esi,(%esp)
- movl %edi,4(%esp)
- movl %eax,8(%esp)
- movl %ebx,12(%esp)
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
- movl (%edx),%ecx
- testl $67108864,%ecx
- jz L002loop_x86
- movl 4(%edx),%edx
- movq (%esi),%mm0
- andl $16777216,%ecx
- movq 8(%esi),%mm1
- andl $512,%edx
- movq 16(%esi),%mm2
- orl %edx,%ecx
- movq 24(%esi),%mm3
- movq 32(%esi),%mm4
- movq 40(%esi),%mm5
- movq 48(%esi),%mm6
- movq 56(%esi),%mm7
- cmpl $16777728,%ecx
- je L003SSSE3
- subl $80,%esp
- jmp L004loop_sse2
-.align 4,0x90
-L004loop_sse2:
- movq %mm1,8(%esp)
- movq %mm2,16(%esp)
- movq %mm3,24(%esp)
- movq %mm5,40(%esp)
- movq %mm6,48(%esp)
- pxor %mm1,%mm2
- movq %mm7,56(%esp)
- movq %mm0,%mm3
- movl (%edi),%eax
- movl 4(%edi),%ebx
- addl $8,%edi
- movl $15,%edx
- bswap %eax
- bswap %ebx
- jmp L00500_14_sse2
-.align 4,0x90
-L00500_14_sse2:
- movd %eax,%mm1
- movl (%edi),%eax
- movd %ebx,%mm7
- movl 4(%edi),%ebx
- addl $8,%edi
- bswap %eax
- bswap %ebx
- punpckldq %mm1,%mm7
- movq %mm4,%mm1
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- movq %mm3,%mm0
- movq %mm7,72(%esp)
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- paddq (%ebp),%mm7
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- subl $8,%esp
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 40(%esp),%mm5
- paddq %mm2,%mm3
- movq %mm0,%mm2
- addl $8,%ebp
- paddq %mm6,%mm3
- movq 48(%esp),%mm6
- decl %edx
- jnz L00500_14_sse2
- movd %eax,%mm1
- movd %ebx,%mm7
- punpckldq %mm1,%mm7
- movq %mm4,%mm1
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- movq %mm3,%mm0
- movq %mm7,72(%esp)
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- paddq (%ebp),%mm7
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- subl $8,%esp
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 192(%esp),%mm7
- paddq %mm2,%mm3
- movq %mm0,%mm2
- addl $8,%ebp
- paddq %mm6,%mm3
- pxor %mm0,%mm0
- movl $32,%edx
- jmp L00616_79_sse2
-.align 4,0x90
-L00616_79_sse2:
- movq 88(%esp),%mm5
- movq %mm7,%mm1
- psrlq $1,%mm7
- movq %mm5,%mm6
- psrlq $6,%mm5
- psllq $56,%mm1
- paddq %mm3,%mm0
- movq %mm7,%mm3
- psrlq $6,%mm7
- pxor %mm1,%mm3
- psllq $7,%mm1
- pxor %mm7,%mm3
- psrlq $1,%mm7
- pxor %mm1,%mm3
- movq %mm5,%mm1
- psrlq $13,%mm5
- pxor %mm3,%mm7
- psllq $3,%mm6
- pxor %mm5,%mm1
- paddq 200(%esp),%mm7
- pxor %mm6,%mm1
- psrlq $42,%mm5
- paddq 128(%esp),%mm7
- pxor %mm5,%mm1
- psllq $42,%mm6
- movq 40(%esp),%mm5
- pxor %mm6,%mm1
- movq 48(%esp),%mm6
- paddq %mm1,%mm7
- movq %mm4,%mm1
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- movq %mm7,72(%esp)
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- paddq (%ebp),%mm7
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- subl $8,%esp
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 192(%esp),%mm7
- paddq %mm6,%mm2
- addl $8,%ebp
- movq 88(%esp),%mm5
- movq %mm7,%mm1
- psrlq $1,%mm7
- movq %mm5,%mm6
- psrlq $6,%mm5
- psllq $56,%mm1
- paddq %mm3,%mm2
- movq %mm7,%mm3
- psrlq $6,%mm7
- pxor %mm1,%mm3
- psllq $7,%mm1
- pxor %mm7,%mm3
- psrlq $1,%mm7
- pxor %mm1,%mm3
- movq %mm5,%mm1
- psrlq $13,%mm5
- pxor %mm3,%mm7
- psllq $3,%mm6
- pxor %mm5,%mm1
- paddq 200(%esp),%mm7
- pxor %mm6,%mm1
- psrlq $42,%mm5
- paddq 128(%esp),%mm7
- pxor %mm5,%mm1
- psllq $42,%mm6
- movq 40(%esp),%mm5
- pxor %mm6,%mm1
- movq 48(%esp),%mm6
- paddq %mm1,%mm7
- movq %mm4,%mm1
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- movq %mm7,72(%esp)
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- paddq (%ebp),%mm7
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- subl $8,%esp
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 192(%esp),%mm7
- paddq %mm6,%mm0
- addl $8,%ebp
- decl %edx
- jnz L00616_79_sse2
- paddq %mm3,%mm0
- movq 8(%esp),%mm1
- movq 24(%esp),%mm3
- movq 40(%esp),%mm5
- movq 48(%esp),%mm6
- movq 56(%esp),%mm7
- pxor %mm1,%mm2
- paddq (%esi),%mm0
- paddq 8(%esi),%mm1
- paddq 16(%esi),%mm2
- paddq 24(%esi),%mm3
- paddq 32(%esi),%mm4
- paddq 40(%esi),%mm5
- paddq 48(%esi),%mm6
- paddq 56(%esi),%mm7
- movl $640,%eax
- movq %mm0,(%esi)
- movq %mm1,8(%esi)
- movq %mm2,16(%esi)
- movq %mm3,24(%esi)
- movq %mm4,32(%esi)
- movq %mm5,40(%esi)
- movq %mm6,48(%esi)
- movq %mm7,56(%esi)
- leal (%esp,%eax,1),%esp
- subl %eax,%ebp
- cmpl 88(%esp),%edi
- jb L004loop_sse2
- movl 92(%esp),%esp
- emms
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 5,0x90
-L003SSSE3:
- leal -64(%esp),%edx
- subl $256,%esp
- movdqa 640(%ebp),%xmm1
- movdqu (%edi),%xmm0
-.byte 102,15,56,0,193
- movdqa (%ebp),%xmm3
- movdqa %xmm1,%xmm2
- movdqu 16(%edi),%xmm1
- paddq %xmm0,%xmm3
-.byte 102,15,56,0,202
- movdqa %xmm3,-128(%edx)
- movdqa 16(%ebp),%xmm4
- movdqa %xmm2,%xmm3
- movdqu 32(%edi),%xmm2
- paddq %xmm1,%xmm4
-.byte 102,15,56,0,211
- movdqa %xmm4,-112(%edx)
- movdqa 32(%ebp),%xmm5
- movdqa %xmm3,%xmm4
- movdqu 48(%edi),%xmm3
- paddq %xmm2,%xmm5
-.byte 102,15,56,0,220
- movdqa %xmm5,-96(%edx)
- movdqa 48(%ebp),%xmm6
- movdqa %xmm4,%xmm5
- movdqu 64(%edi),%xmm4
- paddq %xmm3,%xmm6
-.byte 102,15,56,0,229
- movdqa %xmm6,-80(%edx)
- movdqa 64(%ebp),%xmm7
- movdqa %xmm5,%xmm6
- movdqu 80(%edi),%xmm5
- paddq %xmm4,%xmm7
-.byte 102,15,56,0,238
- movdqa %xmm7,-64(%edx)
- movdqa %xmm0,(%edx)
- movdqa 80(%ebp),%xmm0
- movdqa %xmm6,%xmm7
- movdqu 96(%edi),%xmm6
- paddq %xmm5,%xmm0
-.byte 102,15,56,0,247
- movdqa %xmm0,-48(%edx)
- movdqa %xmm1,16(%edx)
- movdqa 96(%ebp),%xmm1
- movdqa %xmm7,%xmm0
- movdqu 112(%edi),%xmm7
- paddq %xmm6,%xmm1
-.byte 102,15,56,0,248
- movdqa %xmm1,-32(%edx)
- movdqa %xmm2,32(%edx)
- movdqa 112(%ebp),%xmm2
- movdqa (%edx),%xmm0
- paddq %xmm7,%xmm2
- movdqa %xmm2,-16(%edx)
- nop
-.align 5,0x90
-L007loop_ssse3:
- movdqa 16(%edx),%xmm2
- movdqa %xmm3,48(%edx)
- leal 128(%ebp),%ebp
- movq %mm1,8(%esp)
- movl %edi,%ebx
- movq %mm2,16(%esp)
- leal 128(%edi),%edi
- movq %mm3,24(%esp)
- cmpl %eax,%edi
- movq %mm5,40(%esp)
- cmovbl %edi,%ebx
- movq %mm6,48(%esp)
- movl $4,%ecx
- pxor %mm1,%mm2
- movq %mm7,56(%esp)
- pxor %mm3,%mm3
- jmp L00800_47_ssse3
-.align 5,0x90
-L00800_47_ssse3:
- movdqa %xmm5,%xmm3
- movdqa %xmm2,%xmm1
-.byte 102,15,58,15,208,8
- movdqa %xmm4,(%edx)
-.byte 102,15,58,15,220,8
- movdqa %xmm2,%xmm4
- psrlq $7,%xmm2
- paddq %xmm3,%xmm0
- movdqa %xmm4,%xmm3
- psrlq $1,%xmm4
- psllq $56,%xmm3
- pxor %xmm4,%xmm2
- psrlq $7,%xmm4
- pxor %xmm3,%xmm2
- psllq $7,%xmm3
- pxor %xmm4,%xmm2
- movdqa %xmm7,%xmm4
- pxor %xmm3,%xmm2
- movdqa %xmm7,%xmm3
- psrlq $6,%xmm4
- paddq %xmm2,%xmm0
- movdqa %xmm7,%xmm2
- psrlq $19,%xmm3
- psllq $3,%xmm2
- pxor %xmm3,%xmm4
- psrlq $42,%xmm3
- pxor %xmm2,%xmm4
- psllq $42,%xmm2
- pxor %xmm3,%xmm4
- movdqa 32(%edx),%xmm3
- pxor %xmm2,%xmm4
- movdqa (%ebp),%xmm2
- movq %mm4,%mm1
- paddq %xmm4,%xmm0
- movq -128(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- paddq %xmm0,%xmm2
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 32(%esp),%mm5
- paddq %mm6,%mm2
- movq 40(%esp),%mm6
- movq %mm4,%mm1
- movq -120(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,24(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,56(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 48(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 16(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq (%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 24(%esp),%mm5
- paddq %mm6,%mm0
- movq 32(%esp),%mm6
- movdqa %xmm2,-128(%edx)
- movdqa %xmm6,%xmm4
- movdqa %xmm3,%xmm2
-.byte 102,15,58,15,217,8
- movdqa %xmm5,16(%edx)
-.byte 102,15,58,15,229,8
- movdqa %xmm3,%xmm5
- psrlq $7,%xmm3
- paddq %xmm4,%xmm1
- movdqa %xmm5,%xmm4
- psrlq $1,%xmm5
- psllq $56,%xmm4
- pxor %xmm5,%xmm3
- psrlq $7,%xmm5
- pxor %xmm4,%xmm3
- psllq $7,%xmm4
- pxor %xmm5,%xmm3
- movdqa %xmm0,%xmm5
- pxor %xmm4,%xmm3
- movdqa %xmm0,%xmm4
- psrlq $6,%xmm5
- paddq %xmm3,%xmm1
- movdqa %xmm0,%xmm3
- psrlq $19,%xmm4
- psllq $3,%xmm3
- pxor %xmm4,%xmm5
- psrlq $42,%xmm4
- pxor %xmm3,%xmm5
- psllq $42,%xmm3
- pxor %xmm4,%xmm5
- movdqa 48(%edx),%xmm4
- pxor %xmm3,%xmm5
- movdqa 16(%ebp),%xmm3
- movq %mm4,%mm1
- paddq %xmm5,%xmm1
- movq -112(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,16(%esp)
- paddq %xmm1,%xmm3
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,48(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 40(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 8(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 56(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 16(%esp),%mm5
- paddq %mm6,%mm2
- movq 24(%esp),%mm6
- movq %mm4,%mm1
- movq -104(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,8(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,40(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 32(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq (%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 48(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 8(%esp),%mm5
- paddq %mm6,%mm0
- movq 16(%esp),%mm6
- movdqa %xmm3,-112(%edx)
- movdqa %xmm7,%xmm5
- movdqa %xmm4,%xmm3
-.byte 102,15,58,15,226,8
- movdqa %xmm6,32(%edx)
-.byte 102,15,58,15,238,8
- movdqa %xmm4,%xmm6
- psrlq $7,%xmm4
- paddq %xmm5,%xmm2
- movdqa %xmm6,%xmm5
- psrlq $1,%xmm6
- psllq $56,%xmm5
- pxor %xmm6,%xmm4
- psrlq $7,%xmm6
- pxor %xmm5,%xmm4
- psllq $7,%xmm5
- pxor %xmm6,%xmm4
- movdqa %xmm1,%xmm6
- pxor %xmm5,%xmm4
- movdqa %xmm1,%xmm5
- psrlq $6,%xmm6
- paddq %xmm4,%xmm2
- movdqa %xmm1,%xmm4
- psrlq $19,%xmm5
- psllq $3,%xmm4
- pxor %xmm5,%xmm6
- psrlq $42,%xmm5
- pxor %xmm4,%xmm6
- psllq $42,%xmm4
- pxor %xmm5,%xmm6
- movdqa (%edx),%xmm5
- pxor %xmm4,%xmm6
- movdqa 32(%ebp),%xmm4
- movq %mm4,%mm1
- paddq %xmm6,%xmm2
- movq -96(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,(%esp)
- paddq %xmm2,%xmm4
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,32(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 24(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 56(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 40(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq (%esp),%mm5
- paddq %mm6,%mm2
- movq 8(%esp),%mm6
- movq %mm4,%mm1
- movq -88(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,56(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,24(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 16(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 48(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 32(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 56(%esp),%mm5
- paddq %mm6,%mm0
- movq (%esp),%mm6
- movdqa %xmm4,-96(%edx)
- movdqa %xmm0,%xmm6
- movdqa %xmm5,%xmm4
-.byte 102,15,58,15,235,8
- movdqa %xmm7,48(%edx)
-.byte 102,15,58,15,247,8
- movdqa %xmm5,%xmm7
- psrlq $7,%xmm5
- paddq %xmm6,%xmm3
- movdqa %xmm7,%xmm6
- psrlq $1,%xmm7
- psllq $56,%xmm6
- pxor %xmm7,%xmm5
- psrlq $7,%xmm7
- pxor %xmm6,%xmm5
- psllq $7,%xmm6
- pxor %xmm7,%xmm5
- movdqa %xmm2,%xmm7
- pxor %xmm6,%xmm5
- movdqa %xmm2,%xmm6
- psrlq $6,%xmm7
- paddq %xmm5,%xmm3
- movdqa %xmm2,%xmm5
- psrlq $19,%xmm6
- psllq $3,%xmm5
- pxor %xmm6,%xmm7
- psrlq $42,%xmm6
- pxor %xmm5,%xmm7
- psllq $42,%xmm5
- pxor %xmm6,%xmm7
- movdqa 16(%edx),%xmm6
- pxor %xmm5,%xmm7
- movdqa 48(%ebp),%xmm5
- movq %mm4,%mm1
- paddq %xmm7,%xmm3
- movq -80(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,48(%esp)
- paddq %xmm3,%xmm5
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,16(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 8(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 40(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 24(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 48(%esp),%mm5
- paddq %mm6,%mm2
- movq 56(%esp),%mm6
- movq %mm4,%mm1
- movq -72(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,40(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,8(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq (%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 32(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 16(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 40(%esp),%mm5
- paddq %mm6,%mm0
- movq 48(%esp),%mm6
- movdqa %xmm5,-80(%edx)
- movdqa %xmm1,%xmm7
- movdqa %xmm6,%xmm5
-.byte 102,15,58,15,244,8
- movdqa %xmm0,(%edx)
-.byte 102,15,58,15,248,8
- movdqa %xmm6,%xmm0
- psrlq $7,%xmm6
- paddq %xmm7,%xmm4
- movdqa %xmm0,%xmm7
- psrlq $1,%xmm0
- psllq $56,%xmm7
- pxor %xmm0,%xmm6
- psrlq $7,%xmm0
- pxor %xmm7,%xmm6
- psllq $7,%xmm7
- pxor %xmm0,%xmm6
- movdqa %xmm3,%xmm0
- pxor %xmm7,%xmm6
- movdqa %xmm3,%xmm7
- psrlq $6,%xmm0
- paddq %xmm6,%xmm4
- movdqa %xmm3,%xmm6
- psrlq $19,%xmm7
- psllq $3,%xmm6
- pxor %xmm7,%xmm0
- psrlq $42,%xmm7
- pxor %xmm6,%xmm0
- psllq $42,%xmm6
- pxor %xmm7,%xmm0
- movdqa 32(%edx),%xmm7
- pxor %xmm6,%xmm0
- movdqa 64(%ebp),%xmm6
- movq %mm4,%mm1
- paddq %xmm0,%xmm4
- movq -64(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- paddq %xmm4,%xmm6
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 32(%esp),%mm5
- paddq %mm6,%mm2
- movq 40(%esp),%mm6
- movq %mm4,%mm1
- movq -56(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,24(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,56(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 48(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 16(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq (%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 24(%esp),%mm5
- paddq %mm6,%mm0
- movq 32(%esp),%mm6
- movdqa %xmm6,-64(%edx)
- movdqa %xmm2,%xmm0
- movdqa %xmm7,%xmm6
-.byte 102,15,58,15,253,8
- movdqa %xmm1,16(%edx)
-.byte 102,15,58,15,193,8
- movdqa %xmm7,%xmm1
- psrlq $7,%xmm7
- paddq %xmm0,%xmm5
- movdqa %xmm1,%xmm0
- psrlq $1,%xmm1
- psllq $56,%xmm0
- pxor %xmm1,%xmm7
- psrlq $7,%xmm1
- pxor %xmm0,%xmm7
- psllq $7,%xmm0
- pxor %xmm1,%xmm7
- movdqa %xmm4,%xmm1
- pxor %xmm0,%xmm7
- movdqa %xmm4,%xmm0
- psrlq $6,%xmm1
- paddq %xmm7,%xmm5
- movdqa %xmm4,%xmm7
- psrlq $19,%xmm0
- psllq $3,%xmm7
- pxor %xmm0,%xmm1
- psrlq $42,%xmm0
- pxor %xmm7,%xmm1
- psllq $42,%xmm7
- pxor %xmm0,%xmm1
- movdqa 48(%edx),%xmm0
- pxor %xmm7,%xmm1
- movdqa 80(%ebp),%xmm7
- movq %mm4,%mm1
- paddq %xmm1,%xmm5
- movq -48(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,16(%esp)
- paddq %xmm5,%xmm7
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,48(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 40(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 8(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 56(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 16(%esp),%mm5
- paddq %mm6,%mm2
- movq 24(%esp),%mm6
- movq %mm4,%mm1
- movq -40(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,8(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,40(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 32(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq (%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 48(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 8(%esp),%mm5
- paddq %mm6,%mm0
- movq 16(%esp),%mm6
- movdqa %xmm7,-48(%edx)
- movdqa %xmm3,%xmm1
- movdqa %xmm0,%xmm7
-.byte 102,15,58,15,198,8
- movdqa %xmm2,32(%edx)
-.byte 102,15,58,15,202,8
- movdqa %xmm0,%xmm2
- psrlq $7,%xmm0
- paddq %xmm1,%xmm6
- movdqa %xmm2,%xmm1
- psrlq $1,%xmm2
- psllq $56,%xmm1
- pxor %xmm2,%xmm0
- psrlq $7,%xmm2
- pxor %xmm1,%xmm0
- psllq $7,%xmm1
- pxor %xmm2,%xmm0
- movdqa %xmm5,%xmm2
- pxor %xmm1,%xmm0
- movdqa %xmm5,%xmm1
- psrlq $6,%xmm2
- paddq %xmm0,%xmm6
- movdqa %xmm5,%xmm0
- psrlq $19,%xmm1
- psllq $3,%xmm0
- pxor %xmm1,%xmm2
- psrlq $42,%xmm1
- pxor %xmm0,%xmm2
- psllq $42,%xmm0
- pxor %xmm1,%xmm2
- movdqa (%edx),%xmm1
- pxor %xmm0,%xmm2
- movdqa 96(%ebp),%xmm0
- movq %mm4,%mm1
- paddq %xmm2,%xmm6
- movq -32(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,(%esp)
- paddq %xmm6,%xmm0
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,32(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 24(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 56(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 40(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq (%esp),%mm5
- paddq %mm6,%mm2
- movq 8(%esp),%mm6
- movq %mm4,%mm1
- movq -24(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,56(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,24(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 16(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 48(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 32(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 56(%esp),%mm5
- paddq %mm6,%mm0
- movq (%esp),%mm6
- movdqa %xmm0,-32(%edx)
- movdqa %xmm4,%xmm2
- movdqa %xmm1,%xmm0
-.byte 102,15,58,15,207,8
- movdqa %xmm3,48(%edx)
-.byte 102,15,58,15,211,8
- movdqa %xmm1,%xmm3
- psrlq $7,%xmm1
- paddq %xmm2,%xmm7
- movdqa %xmm3,%xmm2
- psrlq $1,%xmm3
- psllq $56,%xmm2
- pxor %xmm3,%xmm1
- psrlq $7,%xmm3
- pxor %xmm2,%xmm1
- psllq $7,%xmm2
- pxor %xmm3,%xmm1
- movdqa %xmm6,%xmm3
- pxor %xmm2,%xmm1
- movdqa %xmm6,%xmm2
- psrlq $6,%xmm3
- paddq %xmm1,%xmm7
- movdqa %xmm6,%xmm1
- psrlq $19,%xmm2
- psllq $3,%xmm1
- pxor %xmm2,%xmm3
- psrlq $42,%xmm2
- pxor %xmm1,%xmm3
- psllq $42,%xmm1
- pxor %xmm2,%xmm3
- movdqa 16(%edx),%xmm2
- pxor %xmm1,%xmm3
- movdqa 112(%ebp),%xmm1
- movq %mm4,%mm1
- paddq %xmm3,%xmm7
- movq -16(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,48(%esp)
- paddq %xmm7,%xmm1
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,16(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 8(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 40(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 24(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 48(%esp),%mm5
- paddq %mm6,%mm2
- movq 56(%esp),%mm6
- movq %mm4,%mm1
- movq -8(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,40(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,8(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq (%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 32(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 16(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 40(%esp),%mm5
- paddq %mm6,%mm0
- movq 48(%esp),%mm6
- movdqa %xmm1,-16(%edx)
- leal 128(%ebp),%ebp
- decl %ecx
- jnz L00800_47_ssse3
- movdqa (%ebp),%xmm1
- leal -640(%ebp),%ebp
- movdqu (%ebx),%xmm0
-.byte 102,15,56,0,193
- movdqa (%ebp),%xmm3
- movdqa %xmm1,%xmm2
- movdqu 16(%ebx),%xmm1
- paddq %xmm0,%xmm3
-.byte 102,15,56,0,202
- movq %mm4,%mm1
- movq -128(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 32(%esp),%mm5
- paddq %mm6,%mm2
- movq 40(%esp),%mm6
- movq %mm4,%mm1
- movq -120(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,24(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,56(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 48(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 16(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq (%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 24(%esp),%mm5
- paddq %mm6,%mm0
- movq 32(%esp),%mm6
- movdqa %xmm3,-128(%edx)
- movdqa 16(%ebp),%xmm4
- movdqa %xmm2,%xmm3
- movdqu 32(%ebx),%xmm2
- paddq %xmm1,%xmm4
-.byte 102,15,56,0,211
- movq %mm4,%mm1
- movq -112(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,16(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,48(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 40(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 8(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 56(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 16(%esp),%mm5
- paddq %mm6,%mm2
- movq 24(%esp),%mm6
- movq %mm4,%mm1
- movq -104(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,8(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,40(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 32(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq (%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 48(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 8(%esp),%mm5
- paddq %mm6,%mm0
- movq 16(%esp),%mm6
- movdqa %xmm4,-112(%edx)
- movdqa 32(%ebp),%xmm5
- movdqa %xmm3,%xmm4
- movdqu 48(%ebx),%xmm3
- paddq %xmm2,%xmm5
-.byte 102,15,56,0,220
- movq %mm4,%mm1
- movq -96(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,32(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 24(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 56(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 40(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq (%esp),%mm5
- paddq %mm6,%mm2
- movq 8(%esp),%mm6
- movq %mm4,%mm1
- movq -88(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,56(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,24(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 16(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 48(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 32(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 56(%esp),%mm5
- paddq %mm6,%mm0
- movq (%esp),%mm6
- movdqa %xmm5,-96(%edx)
- movdqa 48(%ebp),%xmm6
- movdqa %xmm4,%xmm5
- movdqu 64(%ebx),%xmm4
- paddq %xmm3,%xmm6
-.byte 102,15,56,0,229
- movq %mm4,%mm1
- movq -80(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,48(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,16(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 8(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 40(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 24(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 48(%esp),%mm5
- paddq %mm6,%mm2
- movq 56(%esp),%mm6
- movq %mm4,%mm1
- movq -72(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,40(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,8(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq (%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 32(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 16(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 40(%esp),%mm5
- paddq %mm6,%mm0
- movq 48(%esp),%mm6
- movdqa %xmm6,-80(%edx)
- movdqa 64(%ebp),%xmm7
- movdqa %xmm5,%xmm6
- movdqu 80(%ebx),%xmm5
- paddq %xmm4,%xmm7
-.byte 102,15,56,0,238
- movq %mm4,%mm1
- movq -64(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,32(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 56(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 24(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 8(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 32(%esp),%mm5
- paddq %mm6,%mm2
- movq 40(%esp),%mm6
- movq %mm4,%mm1
- movq -56(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,24(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,56(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 48(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 16(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq (%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 24(%esp),%mm5
- paddq %mm6,%mm0
- movq 32(%esp),%mm6
- movdqa %xmm7,-64(%edx)
- movdqa %xmm0,(%edx)
- movdqa 80(%ebp),%xmm0
- movdqa %xmm6,%xmm7
- movdqu 96(%ebx),%xmm6
- paddq %xmm5,%xmm0
-.byte 102,15,56,0,247
- movq %mm4,%mm1
- movq -48(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,16(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,48(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 40(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 8(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 56(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 16(%esp),%mm5
- paddq %mm6,%mm2
- movq 24(%esp),%mm6
- movq %mm4,%mm1
- movq -40(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,8(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,40(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 32(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq (%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 48(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 8(%esp),%mm5
- paddq %mm6,%mm0
- movq 16(%esp),%mm6
- movdqa %xmm0,-48(%edx)
- movdqa %xmm1,16(%edx)
- movdqa 96(%ebp),%xmm1
- movdqa %xmm7,%xmm0
- movdqu 112(%ebx),%xmm7
- paddq %xmm6,%xmm1
-.byte 102,15,56,0,248
- movq %mm4,%mm1
- movq -32(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,32(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 24(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 56(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 40(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq (%esp),%mm5
- paddq %mm6,%mm2
- movq 8(%esp),%mm6
- movq %mm4,%mm1
- movq -24(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,56(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,24(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 16(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 48(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 32(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 56(%esp),%mm5
- paddq %mm6,%mm0
- movq (%esp),%mm6
- movdqa %xmm1,-32(%edx)
- movdqa %xmm2,32(%edx)
- movdqa 112(%ebp),%xmm2
- movdqa (%edx),%xmm0
- paddq %xmm7,%xmm2
- movq %mm4,%mm1
- movq -16(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,48(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm0
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm0,16(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq 8(%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 40(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm0,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm0,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 24(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm0,%mm2
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- pxor %mm7,%mm6
- movq 48(%esp),%mm5
- paddq %mm6,%mm2
- movq 56(%esp),%mm6
- movq %mm4,%mm1
- movq -8(%edx),%mm7
- pxor %mm6,%mm5
- psrlq $14,%mm1
- movq %mm4,40(%esp)
- pand %mm4,%mm5
- psllq $23,%mm4
- paddq %mm3,%mm2
- movq %mm1,%mm3
- psrlq $4,%mm1
- pxor %mm6,%mm5
- pxor %mm4,%mm3
- psllq $23,%mm4
- pxor %mm1,%mm3
- movq %mm2,8(%esp)
- paddq %mm5,%mm7
- pxor %mm4,%mm3
- psrlq $23,%mm1
- paddq (%esp),%mm7
- pxor %mm1,%mm3
- psllq $4,%mm4
- pxor %mm4,%mm3
- movq 32(%esp),%mm4
- paddq %mm7,%mm3
- movq %mm2,%mm5
- psrlq $28,%mm5
- paddq %mm3,%mm4
- movq %mm2,%mm6
- movq %mm5,%mm7
- psllq $25,%mm6
- movq 16(%esp),%mm1
- psrlq $6,%mm5
- pxor %mm6,%mm7
- psllq $5,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm2
- psrlq $5,%mm5
- pxor %mm6,%mm7
- pand %mm2,%mm0
- psllq $6,%mm6
- pxor %mm5,%mm7
- pxor %mm1,%mm0
- pxor %mm7,%mm6
- movq 40(%esp),%mm5
- paddq %mm6,%mm0
- movq 48(%esp),%mm6
- movdqa %xmm2,-16(%edx)
- movq 8(%esp),%mm1
- paddq %mm3,%mm0
- movq 24(%esp),%mm3
- movq 56(%esp),%mm7
- pxor %mm1,%mm2
- paddq (%esi),%mm0
- paddq 8(%esi),%mm1
- paddq 16(%esi),%mm2
- paddq 24(%esi),%mm3
- paddq 32(%esi),%mm4
- paddq 40(%esi),%mm5
- paddq 48(%esi),%mm6
- paddq 56(%esi),%mm7
- movq %mm0,(%esi)
- movq %mm1,8(%esi)
- movq %mm2,16(%esi)
- movq %mm3,24(%esi)
- movq %mm4,32(%esi)
- movq %mm5,40(%esi)
- movq %mm6,48(%esi)
- movq %mm7,56(%esi)
- cmpl %eax,%edi
- jb L007loop_ssse3
- movl 76(%edx),%esp
- emms
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 4,0x90
-L002loop_x86:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- movl 28(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- movl 44(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- movl 60(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 64(%edi),%eax
- movl 68(%edi),%ebx
- movl 72(%edi),%ecx
- movl 76(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 80(%edi),%eax
- movl 84(%edi),%ebx
- movl 88(%edi),%ecx
- movl 92(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 96(%edi),%eax
- movl 100(%edi),%ebx
- movl 104(%edi),%ecx
- movl 108(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- movl 112(%edi),%eax
- movl 116(%edi),%ebx
- movl 120(%edi),%ecx
- movl 124(%edi),%edx
- bswap %eax
- bswap %ebx
- bswap %ecx
- bswap %edx
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
- addl $128,%edi
- subl $72,%esp
- movl %edi,204(%esp)
- leal 8(%esp),%edi
- movl $16,%ecx
-.long 2784229001
-.align 4,0x90
-L00900_15_x86:
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $148,%dl
- jne L00900_15_x86
-.align 4,0x90
-L01016_79_x86:
- movl 312(%esp),%ecx
- movl 316(%esp),%edx
- movl %ecx,%esi
- shrl $1,%ecx
- movl %edx,%edi
- shrl $1,%edx
- movl %ecx,%eax
- shll $24,%esi
- movl %edx,%ebx
- shll $24,%edi
- xorl %esi,%ebx
- shrl $6,%ecx
- xorl %edi,%eax
- shrl $6,%edx
- xorl %ecx,%eax
- shll $7,%esi
- xorl %edx,%ebx
- shll $1,%edi
- xorl %esi,%ebx
- shrl $1,%ecx
- xorl %edi,%eax
- shrl $1,%edx
- xorl %ecx,%eax
- shll $6,%edi
- xorl %edx,%ebx
- xorl %edi,%eax
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movl 208(%esp),%ecx
- movl 212(%esp),%edx
- movl %ecx,%esi
- shrl $6,%ecx
- movl %edx,%edi
- shrl $6,%edx
- movl %ecx,%eax
- shll $3,%esi
- movl %edx,%ebx
- shll $3,%edi
- xorl %esi,%eax
- shrl $13,%ecx
- xorl %edi,%ebx
- shrl $13,%edx
- xorl %ecx,%eax
- shll $10,%esi
- xorl %edx,%ebx
- shll $10,%edi
- xorl %esi,%ebx
- shrl $10,%ecx
- xorl %edi,%eax
- shrl $10,%edx
- xorl %ecx,%ebx
- shll $13,%edi
- xorl %edx,%eax
- xorl %edi,%eax
- movl 320(%esp),%ecx
- movl 324(%esp),%edx
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- movl 248(%esp),%esi
- movl 252(%esp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,192(%esp)
- movl %ebx,196(%esp)
- movl 40(%esp),%ecx
- movl 44(%esp),%edx
- movl %ecx,%esi
- shrl $9,%ecx
- movl %edx,%edi
- shrl $9,%edx
- movl %ecx,%ebx
- shll $14,%esi
- movl %edx,%eax
- shll $14,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%eax
- shll $4,%esi
- xorl %edx,%ebx
- shll $4,%edi
- xorl %esi,%ebx
- shrl $4,%ecx
- xorl %edi,%eax
- shrl $4,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 48(%esp),%ecx
- movl 52(%esp),%edx
- movl 56(%esp),%esi
- movl 60(%esp),%edi
- addl 64(%esp),%eax
- adcl 68(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- andl 40(%esp),%ecx
- andl 44(%esp),%edx
- addl 192(%esp),%eax
- adcl 196(%esp),%ebx
- xorl %esi,%ecx
- xorl %edi,%edx
- movl (%ebp),%esi
- movl 4(%ebp),%edi
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 32(%esp),%ecx
- movl 36(%esp),%edx
- addl %esi,%eax
- adcl %edi,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- addl %ecx,%eax
- adcl %edx,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl %eax,32(%esp)
- movl %ebx,36(%esp)
- movl %ecx,%esi
- shrl $2,%ecx
- movl %edx,%edi
- shrl $2,%edx
- movl %ecx,%ebx
- shll $4,%esi
- movl %edx,%eax
- shll $4,%edi
- xorl %esi,%ebx
- shrl $5,%ecx
- xorl %edi,%eax
- shrl $5,%edx
- xorl %ecx,%ebx
- shll $21,%esi
- xorl %edx,%eax
- shll $21,%edi
- xorl %esi,%eax
- shrl $21,%ecx
- xorl %edi,%ebx
- shrl $21,%edx
- xorl %ecx,%eax
- shll $5,%esi
- xorl %edx,%ebx
- shll $5,%edi
- xorl %esi,%eax
- xorl %edi,%ebx
- movl 8(%esp),%ecx
- movl 12(%esp),%edx
- movl 16(%esp),%esi
- movl 20(%esp),%edi
- addl (%esp),%eax
- adcl 4(%esp),%ebx
- orl %esi,%ecx
- orl %edi,%edx
- andl 24(%esp),%ecx
- andl 28(%esp),%edx
- andl 8(%esp),%esi
- andl 12(%esp),%edi
- orl %esi,%ecx
- orl %edi,%edx
- addl %ecx,%eax
- adcl %edx,%ebx
- movl %eax,(%esp)
- movl %ebx,4(%esp)
- movb (%ebp),%dl
- subl $8,%esp
- leal 8(%ebp),%ebp
- cmpb $23,%dl
- jne L01016_79_x86
- movl 840(%esp),%esi
- movl 844(%esp),%edi
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edx
- addl 8(%esp),%eax
- adcl 12(%esp),%ebx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- addl 16(%esp),%ecx
- adcl 20(%esp),%edx
- movl %ecx,8(%esi)
- movl %edx,12(%esi)
- movl 16(%esi),%eax
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edx
- addl 24(%esp),%eax
- adcl 28(%esp),%ebx
- movl %eax,16(%esi)
- movl %ebx,20(%esi)
- addl 32(%esp),%ecx
- adcl 36(%esp),%edx
- movl %ecx,24(%esi)
- movl %edx,28(%esi)
- movl 32(%esi),%eax
- movl 36(%esi),%ebx
- movl 40(%esi),%ecx
- movl 44(%esi),%edx
- addl 40(%esp),%eax
- adcl 44(%esp),%ebx
- movl %eax,32(%esi)
- movl %ebx,36(%esi)
- addl 48(%esp),%ecx
- adcl 52(%esp),%edx
- movl %ecx,40(%esi)
- movl %edx,44(%esi)
- movl 48(%esi),%eax
- movl 52(%esi),%ebx
- movl 56(%esi),%ecx
- movl 60(%esi),%edx
- addl 56(%esp),%eax
- adcl 60(%esp),%ebx
- movl %eax,48(%esi)
- movl %ebx,52(%esi)
- addl 64(%esp),%ecx
- adcl 68(%esp),%edx
- movl %ecx,56(%esi)
- movl %edx,60(%esi)
- addl $840,%esp
- subl $640,%ebp
- cmpl 8(%esp),%edi
- jb L002loop_x86
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 6,0x90
-L001K512:
-.long 3609767458,1116352408
-.long 602891725,1899447441
-.long 3964484399,3049323471
-.long 2173295548,3921009573
-.long 4081628472,961987163
-.long 3053834265,1508970993
-.long 2937671579,2453635748
-.long 3664609560,2870763221
-.long 2734883394,3624381080
-.long 1164996542,310598401
-.long 1323610764,607225278
-.long 3590304994,1426881987
-.long 4068182383,1925078388
-.long 991336113,2162078206
-.long 633803317,2614888103
-.long 3479774868,3248222580
-.long 2666613458,3835390401
-.long 944711139,4022224774
-.long 2341262773,264347078
-.long 2007800933,604807628
-.long 1495990901,770255983
-.long 1856431235,1249150122
-.long 3175218132,1555081692
-.long 2198950837,1996064986
-.long 3999719339,2554220882
-.long 766784016,2821834349
-.long 2566594879,2952996808
-.long 3203337956,3210313671
-.long 1034457026,3336571891
-.long 2466948901,3584528711
-.long 3758326383,113926993
-.long 168717936,338241895
-.long 1188179964,666307205
-.long 1546045734,773529912
-.long 1522805485,1294757372
-.long 2643833823,1396182291
-.long 2343527390,1695183700
-.long 1014477480,1986661051
-.long 1206759142,2177026350
-.long 344077627,2456956037
-.long 1290863460,2730485921
-.long 3158454273,2820302411
-.long 3505952657,3259730800
-.long 106217008,3345764771
-.long 3606008344,3516065817
-.long 1432725776,3600352804
-.long 1467031594,4094571909
-.long 851169720,275423344
-.long 3100823752,430227734
-.long 1363258195,506948616
-.long 3750685593,659060556
-.long 3785050280,883997877
-.long 3318307427,958139571
-.long 3812723403,1322822218
-.long 2003034995,1537002063
-.long 3602036899,1747873779
-.long 1575990012,1955562222
-.long 1125592928,2024104815
-.long 2716904306,2227730452
-.long 442776044,2361852424
-.long 593698344,2428436474
-.long 3733110249,2756734187
-.long 2999351573,3204031479
-.long 3815920427,3329325298
-.long 3928383900,3391569614
-.long 566280711,3515267271
-.long 3454069534,3940187606
-.long 4000239992,4118630271
-.long 1914138554,116418474
-.long 2731055270,174292421
-.long 3203993006,289380356
-.long 320620315,460393269
-.long 587496836,685471733
-.long 1086792851,852142971
-.long 365543100,1017036298
-.long 2618297676,1126000580
-.long 3409855158,1288033470
-.long 4234509866,1501505948
-.long 987167468,1607167915
-.long 1246189591,1816402316
-.long 67438087,66051
-.long 202182159,134810123
-.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
-.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte 62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S b/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S
deleted file mode 100644
index 4d2c485..0000000
--- a/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S
+++ /dev/null
@@ -1,680 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-#ifdef BORINGSSL_DISPATCH_TEST
-#endif
-.align 6,0x90
-L_vpaes_consts:
-.long 218628480,235210255,168496130,67568393
-.long 252381056,17041926,33884169,51187212
-.long 252645135,252645135,252645135,252645135
-.long 1512730624,3266504856,1377990664,3401244816
-.long 830229760,1275146365,2969422977,3447763452
-.long 3411033600,2979783055,338359620,2782886510
-.long 4209124096,907596821,221174255,1006095553
-.long 191964160,3799684038,3164090317,1589111125
-.long 182528256,1777043520,2877432650,3265356744
-.long 1874708224,3503451415,3305285752,363511674
-.long 1606117888,3487855781,1093350906,2384367825
-.long 197121,67569157,134941193,202313229
-.long 67569157,134941193,202313229,197121
-.long 134941193,202313229,197121,67569157
-.long 202313229,197121,67569157,134941193
-.long 33619971,100992007,168364043,235736079
-.long 235736079,33619971,100992007,168364043
-.long 168364043,235736079,33619971,100992007
-.long 100992007,168364043,235736079,33619971
-.long 50462976,117835012,185207048,252579084
-.long 252314880,51251460,117574920,184942860
-.long 184682752,252054788,50987272,118359308
-.long 118099200,185467140,251790600,50727180
-.long 2946363062,528716217,1300004225,1881839624
-.long 1532713819,1532713819,1532713819,1532713819
-.long 3602276352,4288629033,3737020424,4153884961
-.long 1354558464,32357713,2958822624,3775749553
-.long 1201988352,132424512,1572796698,503232858
-.long 2213177600,1597421020,4103937655,675398315
-.long 2749646592,4273543773,1511898873,121693092
-.long 3040248576,1103263732,2871565598,1608280554
-.long 2236667136,2588920351,482954393,64377734
-.long 3069987328,291237287,2117370568,3650299247
-.long 533321216,3573750986,2572112006,1401264716
-.long 1339849704,2721158661,548607111,3445553514
-.long 2128193280,3054596040,2183486460,1257083700
-.long 655635200,1165381986,3923443150,2344132524
-.long 190078720,256924420,290342170,357187870
-.long 1610966272,2263057382,4103205268,309794674
-.long 2592527872,2233205587,1335446729,3402964816
-.long 3973531904,3225098121,3002836325,1918774430
-.long 3870401024,2102906079,2284471353,4117666579
-.long 617007872,1021508343,366931923,691083277
-.long 2528395776,3491914898,2968704004,1613121270
-.long 3445188352,3247741094,844474987,4093578302
-.long 651481088,1190302358,1689581232,574775300
-.long 4289380608,206939853,2555985458,2489840491
-.long 2130264064,327674451,3566485037,3349835193
-.long 2470714624,316102159,3636825756,3393945945
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
-.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
-.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
-.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
-.byte 118,101,114,115,105,116,121,41,0
-.align 6,0x90
-.private_extern __vpaes_preheat
-.align 4
-__vpaes_preheat:
- addl (%esp),%ebp
- movdqa -48(%ebp),%xmm7
- movdqa -16(%ebp),%xmm6
- ret
-.private_extern __vpaes_encrypt_core
-.align 4
-__vpaes_encrypt_core:
- movl $16,%ecx
- movl 240(%edx),%eax
- movdqa %xmm6,%xmm1
- movdqa (%ebp),%xmm2
- pandn %xmm0,%xmm1
- pand %xmm6,%xmm0
- movdqu (%edx),%xmm5
-.byte 102,15,56,0,208
- movdqa 16(%ebp),%xmm0
- pxor %xmm5,%xmm2
- psrld $4,%xmm1
- addl $16,%edx
-.byte 102,15,56,0,193
- leal 192(%ebp),%ebx
- pxor %xmm2,%xmm0
- jmp L000enc_entry
-.align 4,0x90
-L001enc_loop:
- movdqa 32(%ebp),%xmm4
- movdqa 48(%ebp),%xmm0
-.byte 102,15,56,0,226
-.byte 102,15,56,0,195
- pxor %xmm5,%xmm4
- movdqa 64(%ebp),%xmm5
- pxor %xmm4,%xmm0
- movdqa -64(%ebx,%ecx,1),%xmm1
-.byte 102,15,56,0,234
- movdqa 80(%ebp),%xmm2
- movdqa (%ebx,%ecx,1),%xmm4
-.byte 102,15,56,0,211
- movdqa %xmm0,%xmm3
- pxor %xmm5,%xmm2
-.byte 102,15,56,0,193
- addl $16,%edx
- pxor %xmm2,%xmm0
-.byte 102,15,56,0,220
- addl $16,%ecx
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,193
- andl $48,%ecx
- subl $1,%eax
- pxor %xmm3,%xmm0
-L000enc_entry:
- movdqa %xmm6,%xmm1
- movdqa -32(%ebp),%xmm5
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm6,%xmm0
-.byte 102,15,56,0,232
- movdqa %xmm7,%xmm3
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
- movdqa %xmm7,%xmm4
- pxor %xmm5,%xmm3
-.byte 102,15,56,0,224
- movdqa %xmm7,%xmm2
- pxor %xmm5,%xmm4
-.byte 102,15,56,0,211
- movdqa %xmm7,%xmm3
- pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
- movdqu (%edx),%xmm5
- pxor %xmm1,%xmm3
- jnz L001enc_loop
- movdqa 96(%ebp),%xmm4
- movdqa 112(%ebp),%xmm0
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
-.byte 102,15,56,0,195
- movdqa 64(%ebx,%ecx,1),%xmm1
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,193
- ret
-.private_extern __vpaes_decrypt_core
-.align 4
-__vpaes_decrypt_core:
- leal 608(%ebp),%ebx
- movl 240(%edx),%eax
- movdqa %xmm6,%xmm1
- movdqa -64(%ebx),%xmm2
- pandn %xmm0,%xmm1
- movl %eax,%ecx
- psrld $4,%xmm1
- movdqu (%edx),%xmm5
- shll $4,%ecx
- pand %xmm6,%xmm0
-.byte 102,15,56,0,208
- movdqa -48(%ebx),%xmm0
- xorl $48,%ecx
-.byte 102,15,56,0,193
- andl $48,%ecx
- pxor %xmm5,%xmm2
- movdqa 176(%ebp),%xmm5
- pxor %xmm2,%xmm0
- addl $16,%edx
- leal -352(%ebx,%ecx,1),%ecx
- jmp L002dec_entry
-.align 4,0x90
-L003dec_loop:
- movdqa -32(%ebx),%xmm4
- movdqa -16(%ebx),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa (%ebx),%xmm4
- pxor %xmm1,%xmm0
- movdqa 16(%ebx),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa 32(%ebx),%xmm4
- pxor %xmm1,%xmm0
- movdqa 48(%ebx),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa 64(%ebx),%xmm4
- pxor %xmm1,%xmm0
- movdqa 80(%ebx),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- addl $16,%edx
-.byte 102,15,58,15,237,12
- pxor %xmm1,%xmm0
- subl $1,%eax
-L002dec_entry:
- movdqa %xmm6,%xmm1
- movdqa -32(%ebp),%xmm2
- pandn %xmm0,%xmm1
- pand %xmm6,%xmm0
- psrld $4,%xmm1
-.byte 102,15,56,0,208
- movdqa %xmm7,%xmm3
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
- movdqa %xmm7,%xmm4
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,224
- pxor %xmm2,%xmm4
- movdqa %xmm7,%xmm2
-.byte 102,15,56,0,211
- movdqa %xmm7,%xmm3
- pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
- movdqu (%edx),%xmm0
- pxor %xmm1,%xmm3
- jnz L003dec_loop
- movdqa 96(%ebx),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 112(%ebx),%xmm0
- movdqa (%ecx),%xmm2
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,194
- ret
-.private_extern __vpaes_schedule_core
-.align 4
-__vpaes_schedule_core:
- addl (%esp),%ebp
- movdqu (%esi),%xmm0
- movdqa 320(%ebp),%xmm2
- movdqa %xmm0,%xmm3
- leal (%ebp),%ebx
- movdqa %xmm2,4(%esp)
- call __vpaes_schedule_transform
- movdqa %xmm0,%xmm7
- testl %edi,%edi
- jnz L004schedule_am_decrypting
- movdqu %xmm0,(%edx)
- jmp L005schedule_go
-L004schedule_am_decrypting:
- movdqa 256(%ebp,%ecx,1),%xmm1
-.byte 102,15,56,0,217
- movdqu %xmm3,(%edx)
- xorl $48,%ecx
-L005schedule_go:
- cmpl $192,%eax
- ja L006schedule_256
- je L007schedule_192
-L008schedule_128:
- movl $10,%eax
-L009loop_schedule_128:
- call __vpaes_schedule_round
- decl %eax
- jz L010schedule_mangle_last
- call __vpaes_schedule_mangle
- jmp L009loop_schedule_128
-.align 4,0x90
-L007schedule_192:
- movdqu 8(%esi),%xmm0
- call __vpaes_schedule_transform
- movdqa %xmm0,%xmm6
- pxor %xmm4,%xmm4
- movhlps %xmm4,%xmm6
- movl $4,%eax
-L011loop_schedule_192:
- call __vpaes_schedule_round
-.byte 102,15,58,15,198,8
- call __vpaes_schedule_mangle
- call __vpaes_schedule_192_smear
- call __vpaes_schedule_mangle
- call __vpaes_schedule_round
- decl %eax
- jz L010schedule_mangle_last
- call __vpaes_schedule_mangle
- call __vpaes_schedule_192_smear
- jmp L011loop_schedule_192
-.align 4,0x90
-L006schedule_256:
- movdqu 16(%esi),%xmm0
- call __vpaes_schedule_transform
- movl $7,%eax
-L012loop_schedule_256:
- call __vpaes_schedule_mangle
- movdqa %xmm0,%xmm6
- call __vpaes_schedule_round
- decl %eax
- jz L010schedule_mangle_last
- call __vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
- movdqa %xmm7,20(%esp)
- movdqa %xmm6,%xmm7
- call L_vpaes_schedule_low_round
- movdqa 20(%esp),%xmm7
- jmp L012loop_schedule_256
-.align 4,0x90
-L010schedule_mangle_last:
- leal 384(%ebp),%ebx
- testl %edi,%edi
- jnz L013schedule_mangle_last_dec
- movdqa 256(%ebp,%ecx,1),%xmm1
-.byte 102,15,56,0,193
- leal 352(%ebp),%ebx
- addl $32,%edx
-L013schedule_mangle_last_dec:
- addl $-16,%edx
- pxor 336(%ebp),%xmm0
- call __vpaes_schedule_transform
- movdqu %xmm0,(%edx)
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- ret
-.private_extern __vpaes_schedule_192_smear
-.align 4
-__vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
- pxor %xmm1,%xmm6
- pxor %xmm1,%xmm1
- pxor %xmm0,%xmm6
- movdqa %xmm6,%xmm0
- movhlps %xmm1,%xmm6
- ret
-.private_extern __vpaes_schedule_round
-.align 4
-__vpaes_schedule_round:
- movdqa 8(%esp),%xmm2
- pxor %xmm1,%xmm1
-.byte 102,15,58,15,202,15
-.byte 102,15,58,15,210,15
- pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
-.byte 102,15,58,15,192,1
- movdqa %xmm2,8(%esp)
-L_vpaes_schedule_low_round:
- movdqa %xmm7,%xmm1
- pslldq $4,%xmm7
- pxor %xmm1,%xmm7
- movdqa %xmm7,%xmm1
- pslldq $8,%xmm7
- pxor %xmm1,%xmm7
- pxor 336(%ebp),%xmm7
- movdqa -16(%ebp),%xmm4
- movdqa -48(%ebp),%xmm5
- movdqa %xmm4,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm4,%xmm0
- movdqa -32(%ebp),%xmm2
-.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
- movdqa %xmm5,%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
- movdqa %xmm5,%xmm4
-.byte 102,15,56,0,224
- pxor %xmm2,%xmm4
- movdqa %xmm5,%xmm2
-.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
- movdqa %xmm5,%xmm3
-.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
- movdqa 32(%ebp),%xmm4
-.byte 102,15,56,0,226
- movdqa 48(%ebp),%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
- pxor %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- ret
-.private_extern __vpaes_schedule_transform
-.align 4
-__vpaes_schedule_transform:
- movdqa -16(%ebp),%xmm2
- movdqa %xmm2,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm2,%xmm0
- movdqa (%ebx),%xmm2
-.byte 102,15,56,0,208
- movdqa 16(%ebx),%xmm0
-.byte 102,15,56,0,193
- pxor %xmm2,%xmm0
- ret
-.private_extern __vpaes_schedule_mangle
-.align 4
-__vpaes_schedule_mangle:
- movdqa %xmm0,%xmm4
- movdqa 128(%ebp),%xmm5
- testl %edi,%edi
- jnz L014schedule_mangle_dec
- addl $16,%edx
- pxor 336(%ebp),%xmm4
-.byte 102,15,56,0,229
- movdqa %xmm4,%xmm3
-.byte 102,15,56,0,229
- pxor %xmm4,%xmm3
-.byte 102,15,56,0,229
- pxor %xmm4,%xmm3
- jmp L015schedule_mangle_both
-.align 4,0x90
-L014schedule_mangle_dec:
- movdqa -16(%ebp),%xmm2
- leal 416(%ebp),%esi
- movdqa %xmm2,%xmm1
- pandn %xmm4,%xmm1
- psrld $4,%xmm1
- pand %xmm2,%xmm4
- movdqa (%esi),%xmm2
-.byte 102,15,56,0,212
- movdqa 16(%esi),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
- movdqa 32(%esi),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 48(%esi),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
- movdqa 64(%esi),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 80(%esi),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
- movdqa 96(%esi),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 112(%esi),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
- addl $-16,%edx
-L015schedule_mangle_both:
- movdqa 256(%ebp,%ecx,1),%xmm1
-.byte 102,15,56,0,217
- addl $-16,%ecx
- andl $48,%ecx
- movdqu %xmm3,(%edx)
- ret
-.globl _vpaes_set_encrypt_key
-.private_extern _vpaes_set_encrypt_key
-.align 4
-_vpaes_set_encrypt_key:
-L_vpaes_set_encrypt_key_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L016pic
-L016pic:
- popl %ebx
- leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- movl 20(%esp),%esi
- leal -56(%esp),%ebx
- movl 24(%esp),%eax
- andl $-16,%ebx
- movl 28(%esp),%edx
- xchgl %esp,%ebx
- movl %ebx,48(%esp)
- movl %eax,%ebx
- shrl $5,%ebx
- addl $5,%ebx
- movl %ebx,240(%edx)
- movl $48,%ecx
- movl $0,%edi
- leal L_vpaes_consts+0x30-L017pic_point,%ebp
- call __vpaes_schedule_core
-L017pic_point:
- movl 48(%esp),%esp
- xorl %eax,%eax
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _vpaes_set_decrypt_key
-.private_extern _vpaes_set_decrypt_key
-.align 4
-_vpaes_set_decrypt_key:
-L_vpaes_set_decrypt_key_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- leal -56(%esp),%ebx
- movl 24(%esp),%eax
- andl $-16,%ebx
- movl 28(%esp),%edx
- xchgl %esp,%ebx
- movl %ebx,48(%esp)
- movl %eax,%ebx
- shrl $5,%ebx
- addl $5,%ebx
- movl %ebx,240(%edx)
- shll $4,%ebx
- leal 16(%edx,%ebx,1),%edx
- movl $1,%edi
- movl %eax,%ecx
- shrl $1,%ecx
- andl $32,%ecx
- xorl $32,%ecx
- leal L_vpaes_consts+0x30-L018pic_point,%ebp
- call __vpaes_schedule_core
-L018pic_point:
- movl 48(%esp),%esp
- xorl %eax,%eax
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _vpaes_encrypt
-.private_extern _vpaes_encrypt
-.align 4
-_vpaes_encrypt:
-L_vpaes_encrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
-#ifdef BORINGSSL_DISPATCH_TEST
- pushl %ebx
- pushl %edx
- call L019pic
-L019pic:
- popl %ebx
- leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
- movl $1,%edx
- movb %dl,(%ebx)
- popl %edx
- popl %ebx
-#endif
- leal L_vpaes_consts+0x30-L020pic_point,%ebp
- call __vpaes_preheat
-L020pic_point:
- movl 20(%esp),%esi
- leal -56(%esp),%ebx
- movl 24(%esp),%edi
- andl $-16,%ebx
- movl 28(%esp),%edx
- xchgl %esp,%ebx
- movl %ebx,48(%esp)
- movdqu (%esi),%xmm0
- call __vpaes_encrypt_core
- movdqu %xmm0,(%edi)
- movl 48(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _vpaes_decrypt
-.private_extern _vpaes_decrypt
-.align 4
-_vpaes_decrypt:
-L_vpaes_decrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- leal L_vpaes_consts+0x30-L021pic_point,%ebp
- call __vpaes_preheat
-L021pic_point:
- movl 20(%esp),%esi
- leal -56(%esp),%ebx
- movl 24(%esp),%edi
- andl $-16,%ebx
- movl 28(%esp),%edx
- xchgl %esp,%ebx
- movl %ebx,48(%esp)
- movdqu (%esi),%xmm0
- call __vpaes_decrypt_core
- movdqu %xmm0,(%edi)
- movl 48(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _vpaes_cbc_encrypt
-.private_extern _vpaes_cbc_encrypt
-.align 4
-_vpaes_cbc_encrypt:
-L_vpaes_cbc_encrypt_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl 32(%esp),%edx
- subl $16,%eax
- jc L022cbc_abort
- leal -56(%esp),%ebx
- movl 36(%esp),%ebp
- andl $-16,%ebx
- movl 40(%esp),%ecx
- xchgl %esp,%ebx
- movdqu (%ebp),%xmm1
- subl %esi,%edi
- movl %ebx,48(%esp)
- movl %edi,(%esp)
- movl %edx,4(%esp)
- movl %ebp,8(%esp)
- movl %eax,%edi
- leal L_vpaes_consts+0x30-L023pic_point,%ebp
- call __vpaes_preheat
-L023pic_point:
- cmpl $0,%ecx
- je L024cbc_dec_loop
- jmp L025cbc_enc_loop
-.align 4,0x90
-L025cbc_enc_loop:
- movdqu (%esi),%xmm0
- pxor %xmm1,%xmm0
- call __vpaes_encrypt_core
- movl (%esp),%ebx
- movl 4(%esp),%edx
- movdqa %xmm0,%xmm1
- movdqu %xmm0,(%ebx,%esi,1)
- leal 16(%esi),%esi
- subl $16,%edi
- jnc L025cbc_enc_loop
- jmp L026cbc_done
-.align 4,0x90
-L024cbc_dec_loop:
- movdqu (%esi),%xmm0
- movdqa %xmm1,16(%esp)
- movdqa %xmm0,32(%esp)
- call __vpaes_decrypt_core
- movl (%esp),%ebx
- movl 4(%esp),%edx
- pxor 16(%esp),%xmm0
- movdqa 32(%esp),%xmm1
- movdqu %xmm0,(%ebx,%esi,1)
- leal 16(%esi),%esi
- subl $16,%edi
- jnc L024cbc_dec_loop
-L026cbc_done:
- movl 8(%esp),%ebx
- movl 48(%esp),%esp
- movdqu %xmm1,(%ebx)
-L022cbc_abort:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/x86-mont-apple.S b/apple-x86/crypto/fipsmodule/x86-mont-apple.S
deleted file mode 100644
index f991f6c..0000000
--- a/apple-x86/crypto/fipsmodule/x86-mont-apple.S
+++ /dev/null
@@ -1,484 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _bn_mul_mont
-.private_extern _bn_mul_mont
-.align 4
-_bn_mul_mont:
-L_bn_mul_mont_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- xorl %eax,%eax
- movl 40(%esp),%edi
- cmpl $4,%edi
- jl L000just_leave
- leal 20(%esp),%esi
- leal 24(%esp),%edx
- addl $2,%edi
- negl %edi
- leal -32(%esp,%edi,4),%ebp
- negl %edi
- movl %ebp,%eax
- subl %edx,%eax
- andl $2047,%eax
- subl %eax,%ebp
- xorl %ebp,%edx
- andl $2048,%edx
- xorl $2048,%edx
- subl %edx,%ebp
- andl $-64,%ebp
- movl %esp,%eax
- subl %ebp,%eax
- andl $-4096,%eax
- movl %esp,%edx
- leal (%ebp,%eax,1),%esp
- movl (%esp),%eax
- cmpl %ebp,%esp
- ja L001page_walk
- jmp L002page_walk_done
-.align 4,0x90
-L001page_walk:
- leal -4096(%esp),%esp
- movl (%esp),%eax
- cmpl %ebp,%esp
- ja L001page_walk
-L002page_walk_done:
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%ebp
- movl 16(%esi),%esi
- movl (%esi),%esi
- movl %eax,4(%esp)
- movl %ebx,8(%esp)
- movl %ecx,12(%esp)
- movl %ebp,16(%esp)
- movl %esi,20(%esp)
- leal -3(%edi),%ebx
- movl %edx,24(%esp)
- call L003PIC_me_up
-L003PIC_me_up:
- popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
- btl $26,(%eax)
- jnc L004non_sse2
- movl $-1,%eax
- movd %eax,%mm7
- movl 8(%esp),%esi
- movl 12(%esp),%edi
- movl 16(%esp),%ebp
- xorl %edx,%edx
- xorl %ecx,%ecx
- movd (%edi),%mm4
- movd (%esi),%mm5
- movd (%ebp),%mm3
- pmuludq %mm4,%mm5
- movq %mm5,%mm2
- movq %mm5,%mm0
- pand %mm7,%mm0
- pmuludq 20(%esp),%mm5
- pmuludq %mm5,%mm3
- paddq %mm0,%mm3
- movd 4(%ebp),%mm1
- movd 4(%esi),%mm0
- psrlq $32,%mm2
- psrlq $32,%mm3
- incl %ecx
-.align 4,0x90
-L0051st:
- pmuludq %mm4,%mm0
- pmuludq %mm5,%mm1
- paddq %mm0,%mm2
- paddq %mm1,%mm3
- movq %mm2,%mm0
- pand %mm7,%mm0
- movd 4(%ebp,%ecx,4),%mm1
- paddq %mm0,%mm3
- movd 4(%esi,%ecx,4),%mm0
- psrlq $32,%mm2
- movd %mm3,28(%esp,%ecx,4)
- psrlq $32,%mm3
- leal 1(%ecx),%ecx
- cmpl %ebx,%ecx
- jl L0051st
- pmuludq %mm4,%mm0
- pmuludq %mm5,%mm1
- paddq %mm0,%mm2
- paddq %mm1,%mm3
- movq %mm2,%mm0
- pand %mm7,%mm0
- paddq %mm0,%mm3
- movd %mm3,28(%esp,%ecx,4)
- psrlq $32,%mm2
- psrlq $32,%mm3
- paddq %mm2,%mm3
- movq %mm3,32(%esp,%ebx,4)
- incl %edx
-L006outer:
- xorl %ecx,%ecx
- movd (%edi,%edx,4),%mm4
- movd (%esi),%mm5
- movd 32(%esp),%mm6
- movd (%ebp),%mm3
- pmuludq %mm4,%mm5
- paddq %mm6,%mm5
- movq %mm5,%mm0
- movq %mm5,%mm2
- pand %mm7,%mm0
- pmuludq 20(%esp),%mm5
- pmuludq %mm5,%mm3
- paddq %mm0,%mm3
- movd 36(%esp),%mm6
- movd 4(%ebp),%mm1
- movd 4(%esi),%mm0
- psrlq $32,%mm2
- psrlq $32,%mm3
- paddq %mm6,%mm2
- incl %ecx
- decl %ebx
-L007inner:
- pmuludq %mm4,%mm0
- pmuludq %mm5,%mm1
- paddq %mm0,%mm2
- paddq %mm1,%mm3
- movq %mm2,%mm0
- movd 36(%esp,%ecx,4),%mm6
- pand %mm7,%mm0
- movd 4(%ebp,%ecx,4),%mm1
- paddq %mm0,%mm3
- movd 4(%esi,%ecx,4),%mm0
- psrlq $32,%mm2
- movd %mm3,28(%esp,%ecx,4)
- psrlq $32,%mm3
- paddq %mm6,%mm2
- decl %ebx
- leal 1(%ecx),%ecx
- jnz L007inner
- movl %ecx,%ebx
- pmuludq %mm4,%mm0
- pmuludq %mm5,%mm1
- paddq %mm0,%mm2
- paddq %mm1,%mm3
- movq %mm2,%mm0
- pand %mm7,%mm0
- paddq %mm0,%mm3
- movd %mm3,28(%esp,%ecx,4)
- psrlq $32,%mm2
- psrlq $32,%mm3
- movd 36(%esp,%ebx,4),%mm6
- paddq %mm2,%mm3
- paddq %mm6,%mm3
- movq %mm3,32(%esp,%ebx,4)
- leal 1(%edx),%edx
- cmpl %ebx,%edx
- jle L006outer
- emms
- jmp L008common_tail
-.align 4,0x90
-L004non_sse2:
- movl 8(%esp),%esi
- leal 1(%ebx),%ebp
- movl 12(%esp),%edi
- xorl %ecx,%ecx
- movl %esi,%edx
- andl $1,%ebp
- subl %edi,%edx
- leal 4(%edi,%ebx,4),%eax
- orl %edx,%ebp
- movl (%edi),%edi
- jz L009bn_sqr_mont
- movl %eax,28(%esp)
- movl (%esi),%eax
- xorl %edx,%edx
-.align 4,0x90
-L010mull:
- movl %edx,%ebp
- mull %edi
- addl %eax,%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- movl (%esi,%ecx,4),%eax
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl L010mull
- movl %edx,%ebp
- mull %edi
- movl 20(%esp),%edi
- addl %ebp,%eax
- movl 16(%esp),%esi
- adcl $0,%edx
- imull 32(%esp),%edi
- movl %eax,32(%esp,%ebx,4)
- xorl %ecx,%ecx
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- movl (%esi),%eax
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- incl %ecx
- jmp L0112ndmadd
-.align 4,0x90
-L0121stmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,28(%esp,%ecx,4)
- jl L0121stmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- addl %eax,%ebp
- adcl $0,%edx
- imull 32(%esp),%edi
- xorl %ecx,%ecx
- addl 36(%esp,%ebx,4),%edx
- movl %ebp,32(%esp,%ebx,4)
- adcl $0,%ecx
- movl (%esi),%eax
- movl %edx,36(%esp,%ebx,4)
- movl %ecx,40(%esp,%ebx,4)
- mull %edi
- addl 32(%esp),%eax
- movl 4(%esi),%eax
- adcl $0,%edx
- movl $1,%ecx
-.align 4,0x90
-L0112ndmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl L0112ndmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- xorl %eax,%eax
- movl 12(%esp),%ecx
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- leal 4(%ecx),%ecx
- movl %edx,32(%esp,%ebx,4)
- cmpl 28(%esp),%ecx
- movl %eax,36(%esp,%ebx,4)
- je L008common_tail
- movl (%ecx),%edi
- movl 8(%esp),%esi
- movl %ecx,12(%esp)
- xorl %ecx,%ecx
- xorl %edx,%edx
- movl (%esi),%eax
- jmp L0121stmadd
-.align 4,0x90
-L009bn_sqr_mont:
- movl %ebx,(%esp)
- movl %ecx,12(%esp)
- movl %edi,%eax
- mull %edi
- movl %eax,32(%esp)
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
- incl %ecx
-.align 4,0x90
-L013sqr:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal 1(%ecx),%ecx
- adcl $0,%edx
- leal (%ebx,%eax,2),%ebp
- shrl $31,%eax
- cmpl (%esp),%ecx
- movl %eax,%ebx
- movl %ebp,28(%esp,%ecx,4)
- jl L013sqr
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- movl 20(%esp),%edi
- adcl $0,%edx
- movl 16(%esp),%esi
- leal (%ebx,%eax,2),%ebp
- imull 32(%esp),%edi
- shrl $31,%eax
- movl %ebp,32(%esp,%ecx,4)
- leal (%eax,%edx,2),%ebp
- movl (%esi),%eax
- shrl $31,%edx
- movl %ebp,36(%esp,%ecx,4)
- movl %edx,40(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- movl %ecx,%ebx
- adcl $0,%edx
- movl 4(%esi),%eax
- movl $1,%ecx
-.align 4,0x90
-L0143rdmadd:
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ecx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- movl 4(%esi,%ecx,4),%eax
- adcl $0,%edx
- movl %ebp,28(%esp,%ecx,4)
- movl %edx,%ebp
- mull %edi
- addl 36(%esp,%ecx,4),%ebp
- leal 2(%ecx),%ecx
- adcl $0,%edx
- addl %eax,%ebp
- movl (%esi,%ecx,4),%eax
- adcl $0,%edx
- cmpl %ebx,%ecx
- movl %ebp,24(%esp,%ecx,4)
- jl L0143rdmadd
- movl %edx,%ebp
- mull %edi
- addl 32(%esp,%ebx,4),%ebp
- adcl $0,%edx
- addl %eax,%ebp
- adcl $0,%edx
- movl %ebp,28(%esp,%ebx,4)
- movl 12(%esp),%ecx
- xorl %eax,%eax
- movl 8(%esp),%esi
- addl 36(%esp,%ebx,4),%edx
- adcl 40(%esp,%ebx,4),%eax
- movl %edx,32(%esp,%ebx,4)
- cmpl %ebx,%ecx
- movl %eax,36(%esp,%ebx,4)
- je L008common_tail
- movl 4(%esi,%ecx,4),%edi
- leal 1(%ecx),%ecx
- movl %edi,%eax
- movl %ecx,12(%esp)
- mull %edi
- addl 32(%esp,%ecx,4),%eax
- adcl $0,%edx
- movl %eax,32(%esp,%ecx,4)
- xorl %ebp,%ebp
- cmpl %ebx,%ecx
- leal 1(%ecx),%ecx
- je L015sqrlast
- movl %edx,%ebx
- shrl $1,%edx
- andl $1,%ebx
-.align 4,0x90
-L016sqradd:
- movl (%esi,%ecx,4),%eax
- movl %edx,%ebp
- mull %edi
- addl %ebp,%eax
- leal (%eax,%eax,1),%ebp
- adcl $0,%edx
- shrl $31,%eax
- addl 32(%esp,%ecx,4),%ebp
- leal 1(%ecx),%ecx
- adcl $0,%eax
- addl %ebx,%ebp
- adcl $0,%eax
- cmpl (%esp),%ecx
- movl %ebp,28(%esp,%ecx,4)
- movl %eax,%ebx
- jle L016sqradd
- movl %edx,%ebp
- addl %edx,%edx
- shrl $31,%ebp
- addl %ebx,%edx
- adcl $0,%ebp
-L015sqrlast:
- movl 20(%esp),%edi
- movl 16(%esp),%esi
- imull 32(%esp),%edi
- addl 32(%esp,%ecx,4),%edx
- movl (%esi),%eax
- adcl $0,%ebp
- movl %edx,32(%esp,%ecx,4)
- movl %ebp,36(%esp,%ecx,4)
- mull %edi
- addl 32(%esp),%eax
- leal -1(%ecx),%ebx
- adcl $0,%edx
- movl $1,%ecx
- movl 4(%esi),%eax
- jmp L0143rdmadd
-.align 4,0x90
-L008common_tail:
- movl 16(%esp),%ebp
- movl 4(%esp),%edi
- leal 32(%esp),%esi
- movl (%esi),%eax
- movl %ebx,%ecx
- xorl %edx,%edx
-.align 4,0x90
-L017sub:
- sbbl (%ebp,%edx,4),%eax
- movl %eax,(%edi,%edx,4)
- decl %ecx
- movl 4(%esi,%edx,4),%eax
- leal 1(%edx),%edx
- jge L017sub
- sbbl $0,%eax
- movl $-1,%edx
- xorl %eax,%edx
- jmp L018copy
-.align 4,0x90
-L018copy:
- movl 32(%esp,%ebx,4),%esi
- movl (%edi,%ebx,4),%ebp
- movl %ecx,32(%esp,%ebx,4)
- andl %eax,%esi
- andl %edx,%ebp
- orl %esi,%ebp
- movl %ebp,(%edi,%ebx,4)
- decl %ebx
- jge L018copy
- movl 24(%esp),%esp
- movl $1,%eax
-L000just_leave:
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
-.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
-.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
-.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
-.byte 111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol _OPENSSL_ia32cap_P
-.long 0
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/test/trampoline-x86-apple.S b/apple-x86/crypto/test/trampoline-x86-apple.S
deleted file mode 100644
index 4065b9a..0000000
--- a/apple-x86/crypto/test/trampoline-x86-apple.S
+++ /dev/null
@@ -1,168 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl _abi_test_trampoline
-.private_extern _abi_test_trampoline
-.align 4
-_abi_test_trampoline:
-L_abi_test_trampoline_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 24(%esp),%ecx
- movl (%ecx),%esi
- movl 4(%ecx),%edi
- movl 8(%ecx),%ebx
- movl 12(%ecx),%ebp
- subl $44,%esp
- movl 72(%esp),%eax
- xorl %ecx,%ecx
-L000loop:
- cmpl 76(%esp),%ecx
- jae L001loop_done
- movl (%eax,%ecx,4),%edx
- movl %edx,(%esp,%ecx,4)
- addl $1,%ecx
- jmp L000loop
-L001loop_done:
- call *64(%esp)
- addl $44,%esp
- movl 24(%esp),%ecx
- movl %esi,(%ecx)
- movl %edi,4(%ecx)
- movl %ebx,8(%ecx)
- movl %ebp,12(%ecx)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.globl _abi_test_get_and_clear_direction_flag
-.private_extern _abi_test_get_and_clear_direction_flag
-.align 4
-_abi_test_get_and_clear_direction_flag:
-L_abi_test_get_and_clear_direction_flag_begin:
- pushfl
- popl %eax
- andl $1024,%eax
- shrl $10,%eax
- cld
- ret
-.globl _abi_test_set_direction_flag
-.private_extern _abi_test_set_direction_flag
-.align 4
-_abi_test_set_direction_flag:
-L_abi_test_set_direction_flag_begin:
- std
- ret
-.globl _abi_test_clobber_eax
-.private_extern _abi_test_clobber_eax
-.align 4
-_abi_test_clobber_eax:
-L_abi_test_clobber_eax_begin:
- xorl %eax,%eax
- ret
-.globl _abi_test_clobber_ebx
-.private_extern _abi_test_clobber_ebx
-.align 4
-_abi_test_clobber_ebx:
-L_abi_test_clobber_ebx_begin:
- xorl %ebx,%ebx
- ret
-.globl _abi_test_clobber_ecx
-.private_extern _abi_test_clobber_ecx
-.align 4
-_abi_test_clobber_ecx:
-L_abi_test_clobber_ecx_begin:
- xorl %ecx,%ecx
- ret
-.globl _abi_test_clobber_edx
-.private_extern _abi_test_clobber_edx
-.align 4
-_abi_test_clobber_edx:
-L_abi_test_clobber_edx_begin:
- xorl %edx,%edx
- ret
-.globl _abi_test_clobber_edi
-.private_extern _abi_test_clobber_edi
-.align 4
-_abi_test_clobber_edi:
-L_abi_test_clobber_edi_begin:
- xorl %edi,%edi
- ret
-.globl _abi_test_clobber_esi
-.private_extern _abi_test_clobber_esi
-.align 4
-_abi_test_clobber_esi:
-L_abi_test_clobber_esi_begin:
- xorl %esi,%esi
- ret
-.globl _abi_test_clobber_ebp
-.private_extern _abi_test_clobber_ebp
-.align 4
-_abi_test_clobber_ebp:
-L_abi_test_clobber_ebp_begin:
- xorl %ebp,%ebp
- ret
-.globl _abi_test_clobber_xmm0
-.private_extern _abi_test_clobber_xmm0
-.align 4
-_abi_test_clobber_xmm0:
-L_abi_test_clobber_xmm0_begin:
- pxor %xmm0,%xmm0
- ret
-.globl _abi_test_clobber_xmm1
-.private_extern _abi_test_clobber_xmm1
-.align 4
-_abi_test_clobber_xmm1:
-L_abi_test_clobber_xmm1_begin:
- pxor %xmm1,%xmm1
- ret
-.globl _abi_test_clobber_xmm2
-.private_extern _abi_test_clobber_xmm2
-.align 4
-_abi_test_clobber_xmm2:
-L_abi_test_clobber_xmm2_begin:
- pxor %xmm2,%xmm2
- ret
-.globl _abi_test_clobber_xmm3
-.private_extern _abi_test_clobber_xmm3
-.align 4
-_abi_test_clobber_xmm3:
-L_abi_test_clobber_xmm3_begin:
- pxor %xmm3,%xmm3
- ret
-.globl _abi_test_clobber_xmm4
-.private_extern _abi_test_clobber_xmm4
-.align 4
-_abi_test_clobber_xmm4:
-L_abi_test_clobber_xmm4_begin:
- pxor %xmm4,%xmm4
- ret
-.globl _abi_test_clobber_xmm5
-.private_extern _abi_test_clobber_xmm5
-.align 4
-_abi_test_clobber_xmm5:
-L_abi_test_clobber_xmm5_begin:
- pxor %xmm5,%xmm5
- ret
-.globl _abi_test_clobber_xmm6
-.private_extern _abi_test_clobber_xmm6
-.align 4
-_abi_test_clobber_xmm6:
-L_abi_test_clobber_xmm6_begin:
- pxor %xmm6,%xmm6
- ret
-.globl _abi_test_clobber_xmm7
-.private_extern _abi_test_clobber_xmm7
-.align 4
-_abi_test_clobber_xmm7:
-L_abi_test_clobber_xmm7_begin:
- pxor %xmm7,%xmm7
- ret
-#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S b/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S
deleted file mode 100644
index 2c46926..0000000
--- a/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S
+++ /dev/null
@@ -1,1621 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-.section __DATA,__const
-.p2align 6
-L$zero:
-.long 0,0,0,0
-L$one:
-.long 1,0,0,0
-L$inc:
-.long 0,1,2,3
-L$four:
-.long 4,4,4,4
-L$incy:
-.long 0,2,4,6,1,3,5,7
-L$eight:
-.long 8,8,8,8,8,8,8,8
-L$rot16:
-.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
-L$rot24:
-.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
-L$sigma:
-.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
-.p2align 6
-L$zeroz:
-.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
-L$fourz:
-.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
-L$incz:
-.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-L$sixteen:
-.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text
-.globl _ChaCha20_ctr32
-.private_extern _ChaCha20_ctr32
-
-.p2align 6
-_ChaCha20_ctr32:
-
-_CET_ENDBR
- cmpq $0,%rdx
- je L$no_data
- movq _OPENSSL_ia32cap_P+4(%rip),%r10
- testl $512,%r10d
- jnz L$ChaCha20_ssse3
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $64+24,%rsp
-
-L$ctr32_body:
-
-
- movdqu (%rcx),%xmm1
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa L$one(%rip),%xmm4
-
-
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- movq %rdx,%rbp
- jmp L$oop_outer
-
-.p2align 5
-L$oop_outer:
- movl $0x61707865,%eax
- movl $0x3320646e,%ebx
- movl $0x79622d32,%ecx
- movl $0x6b206574,%edx
- movl 16(%rsp),%r8d
- movl 20(%rsp),%r9d
- movl 24(%rsp),%r10d
- movl 28(%rsp),%r11d
- movd %xmm3,%r12d
- movl 52(%rsp),%r13d
- movl 56(%rsp),%r14d
- movl 60(%rsp),%r15d
-
- movq %rbp,64+0(%rsp)
- movl $10,%ebp
- movq %rsi,64+8(%rsp)
-.byte 102,72,15,126,214
- movq %rdi,64+16(%rsp)
- movq %rsi,%rdi
- shrq $32,%rdi
- jmp L$oop
-
-.p2align 5
-L$oop:
- addl %r8d,%eax
- xorl %eax,%r12d
- roll $16,%r12d
- addl %r9d,%ebx
- xorl %ebx,%r13d
- roll $16,%r13d
- addl %r12d,%esi
- xorl %esi,%r8d
- roll $12,%r8d
- addl %r13d,%edi
- xorl %edi,%r9d
- roll $12,%r9d
- addl %r8d,%eax
- xorl %eax,%r12d
- roll $8,%r12d
- addl %r9d,%ebx
- xorl %ebx,%r13d
- roll $8,%r13d
- addl %r12d,%esi
- xorl %esi,%r8d
- roll $7,%r8d
- addl %r13d,%edi
- xorl %edi,%r9d
- roll $7,%r9d
- movl %esi,32(%rsp)
- movl %edi,36(%rsp)
- movl 40(%rsp),%esi
- movl 44(%rsp),%edi
- addl %r10d,%ecx
- xorl %ecx,%r14d
- roll $16,%r14d
- addl %r11d,%edx
- xorl %edx,%r15d
- roll $16,%r15d
- addl %r14d,%esi
- xorl %esi,%r10d
- roll $12,%r10d
- addl %r15d,%edi
- xorl %edi,%r11d
- roll $12,%r11d
- addl %r10d,%ecx
- xorl %ecx,%r14d
- roll $8,%r14d
- addl %r11d,%edx
- xorl %edx,%r15d
- roll $8,%r15d
- addl %r14d,%esi
- xorl %esi,%r10d
- roll $7,%r10d
- addl %r15d,%edi
- xorl %edi,%r11d
- roll $7,%r11d
- addl %r9d,%eax
- xorl %eax,%r15d
- roll $16,%r15d
- addl %r10d,%ebx
- xorl %ebx,%r12d
- roll $16,%r12d
- addl %r15d,%esi
- xorl %esi,%r9d
- roll $12,%r9d
- addl %r12d,%edi
- xorl %edi,%r10d
- roll $12,%r10d
- addl %r9d,%eax
- xorl %eax,%r15d
- roll $8,%r15d
- addl %r10d,%ebx
- xorl %ebx,%r12d
- roll $8,%r12d
- addl %r15d,%esi
- xorl %esi,%r9d
- roll $7,%r9d
- addl %r12d,%edi
- xorl %edi,%r10d
- roll $7,%r10d
- movl %esi,40(%rsp)
- movl %edi,44(%rsp)
- movl 32(%rsp),%esi
- movl 36(%rsp),%edi
- addl %r11d,%ecx
- xorl %ecx,%r13d
- roll $16,%r13d
- addl %r8d,%edx
- xorl %edx,%r14d
- roll $16,%r14d
- addl %r13d,%esi
- xorl %esi,%r11d
- roll $12,%r11d
- addl %r14d,%edi
- xorl %edi,%r8d
- roll $12,%r8d
- addl %r11d,%ecx
- xorl %ecx,%r13d
- roll $8,%r13d
- addl %r8d,%edx
- xorl %edx,%r14d
- roll $8,%r14d
- addl %r13d,%esi
- xorl %esi,%r11d
- roll $7,%r11d
- addl %r14d,%edi
- xorl %edi,%r8d
- roll $7,%r8d
- decl %ebp
- jnz L$oop
- movl %edi,36(%rsp)
- movl %esi,32(%rsp)
- movq 64(%rsp),%rbp
- movdqa %xmm2,%xmm1
- movq 64+8(%rsp),%rsi
- paddd %xmm4,%xmm3
- movq 64+16(%rsp),%rdi
-
- addl $0x61707865,%eax
- addl $0x3320646e,%ebx
- addl $0x79622d32,%ecx
- addl $0x6b206574,%edx
- addl 16(%rsp),%r8d
- addl 20(%rsp),%r9d
- addl 24(%rsp),%r10d
- addl 28(%rsp),%r11d
- addl 48(%rsp),%r12d
- addl 52(%rsp),%r13d
- addl 56(%rsp),%r14d
- addl 60(%rsp),%r15d
- paddd 32(%rsp),%xmm1
-
- cmpq $64,%rbp
- jb L$tail
-
- xorl 0(%rsi),%eax
- xorl 4(%rsi),%ebx
- xorl 8(%rsi),%ecx
- xorl 12(%rsi),%edx
- xorl 16(%rsi),%r8d
- xorl 20(%rsi),%r9d
- xorl 24(%rsi),%r10d
- xorl 28(%rsi),%r11d
- movdqu 32(%rsi),%xmm0
- xorl 48(%rsi),%r12d
- xorl 52(%rsi),%r13d
- xorl 56(%rsi),%r14d
- xorl 60(%rsi),%r15d
- leaq 64(%rsi),%rsi
- pxor %xmm1,%xmm0
-
- movdqa %xmm2,32(%rsp)
- movd %xmm3,48(%rsp)
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- movdqu %xmm0,32(%rdi)
- movl %r12d,48(%rdi)
- movl %r13d,52(%rdi)
- movl %r14d,56(%rdi)
- movl %r15d,60(%rdi)
- leaq 64(%rdi),%rdi
-
- subq $64,%rbp
- jnz L$oop_outer
-
- jmp L$done
-
-.p2align 4
-L$tail:
- movl %eax,0(%rsp)
- movl %ebx,4(%rsp)
- xorq %rbx,%rbx
- movl %ecx,8(%rsp)
- movl %edx,12(%rsp)
- movl %r8d,16(%rsp)
- movl %r9d,20(%rsp)
- movl %r10d,24(%rsp)
- movl %r11d,28(%rsp)
- movdqa %xmm1,32(%rsp)
- movl %r12d,48(%rsp)
- movl %r13d,52(%rsp)
- movl %r14d,56(%rsp)
- movl %r15d,60(%rsp)
-
-L$oop_tail:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%edx
- leaq 1(%rbx),%rbx
- xorl %edx,%eax
- movb %al,-1(%rdi,%rbx,1)
- decq %rbp
- jnz L$oop_tail
-
-L$done:
- leaq 64+24+48(%rsp),%rsi
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$no_data:
- ret
-
-
-
-.p2align 5
-ChaCha20_ssse3:
-L$ChaCha20_ssse3:
-
- movq %rsp,%r9
-
- cmpq $128,%rdx
- ja L$ChaCha20_4x
-
-L$do_sse3_after_all:
- subq $64+8,%rsp
- movdqa L$sigma(%rip),%xmm0
- movdqu (%rcx),%xmm1
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa L$rot16(%rip),%xmm6
- movdqa L$rot24(%rip),%xmm7
-
- movdqa %xmm0,0(%rsp)
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- movq $10,%r8
- jmp L$oop_ssse3
-
-.p2align 5
-L$oop_outer_ssse3:
- movdqa L$one(%rip),%xmm3
- movdqa 0(%rsp),%xmm0
- movdqa 16(%rsp),%xmm1
- movdqa 32(%rsp),%xmm2
- paddd 48(%rsp),%xmm3
- movq $10,%r8
- movdqa %xmm3,48(%rsp)
- jmp L$oop_ssse3
-
-.p2align 5
-L$oop_ssse3:
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $57,%xmm1,%xmm1
- pshufd $147,%xmm3,%xmm3
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $147,%xmm1,%xmm1
- pshufd $57,%xmm3,%xmm3
- decq %r8
- jnz L$oop_ssse3
- paddd 0(%rsp),%xmm0
- paddd 16(%rsp),%xmm1
- paddd 32(%rsp),%xmm2
- paddd 48(%rsp),%xmm3
-
- cmpq $64,%rdx
- jb L$tail_ssse3
-
- movdqu 0(%rsi),%xmm4
- movdqu 16(%rsi),%xmm5
- pxor %xmm4,%xmm0
- movdqu 32(%rsi),%xmm4
- pxor %xmm5,%xmm1
- movdqu 48(%rsi),%xmm5
- leaq 64(%rsi),%rsi
- pxor %xmm4,%xmm2
- pxor %xmm5,%xmm3
-
- movdqu %xmm0,0(%rdi)
- movdqu %xmm1,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
- leaq 64(%rdi),%rdi
-
- subq $64,%rdx
- jnz L$oop_outer_ssse3
-
- jmp L$done_ssse3
-
-.p2align 4
-L$tail_ssse3:
- movdqa %xmm0,0(%rsp)
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- xorq %r8,%r8
-
-L$oop_tail_ssse3:
- movzbl (%rsi,%r8,1),%eax
- movzbl (%rsp,%r8,1),%ecx
- leaq 1(%r8),%r8
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r8,1)
- decq %rdx
- jnz L$oop_tail_ssse3
-
-L$done_ssse3:
- leaq (%r9),%rsp
-
-L$ssse3_epilogue:
- ret
-
-
-
-.p2align 5
-ChaCha20_4x:
-L$ChaCha20_4x:
-
- movq %rsp,%r9
-
- movq %r10,%r11
- shrq $32,%r10
- testq $32,%r10
- jnz L$ChaCha20_8x
- cmpq $192,%rdx
- ja L$proceed4x
-
- andq $71303168,%r11
- cmpq $4194304,%r11
- je L$do_sse3_after_all
-
-L$proceed4x:
- subq $0x140+8,%rsp
- movdqa L$sigma(%rip),%xmm11
- movdqu (%rcx),%xmm15
- movdqu 16(%rcx),%xmm7
- movdqu (%r8),%xmm3
- leaq 256(%rsp),%rcx
- leaq L$rot16(%rip),%r10
- leaq L$rot24(%rip),%r11
-
- pshufd $0x00,%xmm11,%xmm8
- pshufd $0x55,%xmm11,%xmm9
- movdqa %xmm8,64(%rsp)
- pshufd $0xaa,%xmm11,%xmm10
- movdqa %xmm9,80(%rsp)
- pshufd $0xff,%xmm11,%xmm11
- movdqa %xmm10,96(%rsp)
- movdqa %xmm11,112(%rsp)
-
- pshufd $0x00,%xmm15,%xmm12
- pshufd $0x55,%xmm15,%xmm13
- movdqa %xmm12,128-256(%rcx)
- pshufd $0xaa,%xmm15,%xmm14
- movdqa %xmm13,144-256(%rcx)
- pshufd $0xff,%xmm15,%xmm15
- movdqa %xmm14,160-256(%rcx)
- movdqa %xmm15,176-256(%rcx)
-
- pshufd $0x00,%xmm7,%xmm4
- pshufd $0x55,%xmm7,%xmm5
- movdqa %xmm4,192-256(%rcx)
- pshufd $0xaa,%xmm7,%xmm6
- movdqa %xmm5,208-256(%rcx)
- pshufd $0xff,%xmm7,%xmm7
- movdqa %xmm6,224-256(%rcx)
- movdqa %xmm7,240-256(%rcx)
-
- pshufd $0x00,%xmm3,%xmm0
- pshufd $0x55,%xmm3,%xmm1
- paddd L$inc(%rip),%xmm0
- pshufd $0xaa,%xmm3,%xmm2
- movdqa %xmm1,272-256(%rcx)
- pshufd $0xff,%xmm3,%xmm3
- movdqa %xmm2,288-256(%rcx)
- movdqa %xmm3,304-256(%rcx)
-
- jmp L$oop_enter4x
-
-.p2align 5
-L$oop_outer4x:
- movdqa 64(%rsp),%xmm8
- movdqa 80(%rsp),%xmm9
- movdqa 96(%rsp),%xmm10
- movdqa 112(%rsp),%xmm11
- movdqa 128-256(%rcx),%xmm12
- movdqa 144-256(%rcx),%xmm13
- movdqa 160-256(%rcx),%xmm14
- movdqa 176-256(%rcx),%xmm15
- movdqa 192-256(%rcx),%xmm4
- movdqa 208-256(%rcx),%xmm5
- movdqa 224-256(%rcx),%xmm6
- movdqa 240-256(%rcx),%xmm7
- movdqa 256-256(%rcx),%xmm0
- movdqa 272-256(%rcx),%xmm1
- movdqa 288-256(%rcx),%xmm2
- movdqa 304-256(%rcx),%xmm3
- paddd L$four(%rip),%xmm0
-
-L$oop_enter4x:
- movdqa %xmm6,32(%rsp)
- movdqa %xmm7,48(%rsp)
- movdqa (%r10),%xmm7
- movl $10,%eax
- movdqa %xmm0,256-256(%rcx)
- jmp L$oop4x
-
-.p2align 5
-L$oop4x:
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
-.byte 102,15,56,0,199
-.byte 102,15,56,0,207
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm6
- pslld $12,%xmm12
- psrld $20,%xmm6
- movdqa %xmm13,%xmm7
- pslld $12,%xmm13
- por %xmm6,%xmm12
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm13
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm7
- pslld $7,%xmm12
- psrld $25,%xmm7
- movdqa %xmm13,%xmm6
- pslld $7,%xmm13
- por %xmm7,%xmm12
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm13
- movdqa %xmm4,0(%rsp)
- movdqa %xmm5,16(%rsp)
- movdqa 32(%rsp),%xmm4
- movdqa 48(%rsp),%xmm5
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
-.byte 102,15,56,0,215
-.byte 102,15,56,0,223
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm6
- pslld $12,%xmm14
- psrld $20,%xmm6
- movdqa %xmm15,%xmm7
- pslld $12,%xmm15
- por %xmm6,%xmm14
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm15
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
-.byte 102,15,56,0,214
-.byte 102,15,56,0,222
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm7
- pslld $7,%xmm14
- psrld $25,%xmm7
- movdqa %xmm15,%xmm6
- pslld $7,%xmm15
- por %xmm7,%xmm14
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm15
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
-.byte 102,15,56,0,223
-.byte 102,15,56,0,199
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm6
- pslld $12,%xmm13
- psrld $20,%xmm6
- movdqa %xmm14,%xmm7
- pslld $12,%xmm14
- por %xmm6,%xmm13
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm14
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
-.byte 102,15,56,0,222
-.byte 102,15,56,0,198
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm7
- pslld $7,%xmm13
- psrld $25,%xmm7
- movdqa %xmm14,%xmm6
- pslld $7,%xmm14
- por %xmm7,%xmm13
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm14
- movdqa %xmm4,32(%rsp)
- movdqa %xmm5,48(%rsp)
- movdqa 0(%rsp),%xmm4
- movdqa 16(%rsp),%xmm5
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
-.byte 102,15,56,0,207
-.byte 102,15,56,0,215
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm6
- pslld $12,%xmm15
- psrld $20,%xmm6
- movdqa %xmm12,%xmm7
- pslld $12,%xmm12
- por %xmm6,%xmm15
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm12
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm7
- pslld $7,%xmm15
- psrld $25,%xmm7
- movdqa %xmm12,%xmm6
- pslld $7,%xmm12
- por %xmm7,%xmm15
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm12
- decl %eax
- jnz L$oop4x
-
- paddd 64(%rsp),%xmm8
- paddd 80(%rsp),%xmm9
- paddd 96(%rsp),%xmm10
- paddd 112(%rsp),%xmm11
-
- movdqa %xmm8,%xmm6
- punpckldq %xmm9,%xmm8
- movdqa %xmm10,%xmm7
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm9,%xmm6
- punpckhdq %xmm11,%xmm7
- movdqa %xmm8,%xmm9
- punpcklqdq %xmm10,%xmm8
- movdqa %xmm6,%xmm11
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm10,%xmm9
- punpckhqdq %xmm7,%xmm11
- paddd 128-256(%rcx),%xmm12
- paddd 144-256(%rcx),%xmm13
- paddd 160-256(%rcx),%xmm14
- paddd 176-256(%rcx),%xmm15
-
- movdqa %xmm8,0(%rsp)
- movdqa %xmm9,16(%rsp)
- movdqa 32(%rsp),%xmm8
- movdqa 48(%rsp),%xmm9
-
- movdqa %xmm12,%xmm10
- punpckldq %xmm13,%xmm12
- movdqa %xmm14,%xmm7
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm13,%xmm10
- punpckhdq %xmm15,%xmm7
- movdqa %xmm12,%xmm13
- punpcklqdq %xmm14,%xmm12
- movdqa %xmm10,%xmm15
- punpcklqdq %xmm7,%xmm10
- punpckhqdq %xmm14,%xmm13
- punpckhqdq %xmm7,%xmm15
- paddd 192-256(%rcx),%xmm4
- paddd 208-256(%rcx),%xmm5
- paddd 224-256(%rcx),%xmm8
- paddd 240-256(%rcx),%xmm9
-
- movdqa %xmm6,32(%rsp)
- movdqa %xmm11,48(%rsp)
-
- movdqa %xmm4,%xmm14
- punpckldq %xmm5,%xmm4
- movdqa %xmm8,%xmm7
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm5,%xmm14
- punpckhdq %xmm9,%xmm7
- movdqa %xmm4,%xmm5
- punpcklqdq %xmm8,%xmm4
- movdqa %xmm14,%xmm9
- punpcklqdq %xmm7,%xmm14
- punpckhqdq %xmm8,%xmm5
- punpckhqdq %xmm7,%xmm9
- paddd 256-256(%rcx),%xmm0
- paddd 272-256(%rcx),%xmm1
- paddd 288-256(%rcx),%xmm2
- paddd 304-256(%rcx),%xmm3
-
- movdqa %xmm0,%xmm8
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm8
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm8,%xmm3
- punpcklqdq %xmm7,%xmm8
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- cmpq $256,%rdx
- jb L$tail4x
-
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,64(%rdi)
- movdqu 0(%rsi),%xmm6
- movdqu %xmm11,80(%rdi)
- movdqu 16(%rsi),%xmm11
- movdqu %xmm2,96(%rdi)
- movdqu 32(%rsi),%xmm2
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
- movdqu 48(%rsi),%xmm7
- pxor 32(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
-
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 48(%rsp),%xmm6
- pxor %xmm15,%xmm11
- pxor %xmm9,%xmm2
- pxor %xmm3,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu %xmm11,80(%rdi)
- movdqu %xmm2,96(%rdi)
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
-
- subq $256,%rdx
- jnz L$oop_outer4x
-
- jmp L$done4x
-
-L$tail4x:
- cmpq $192,%rdx
- jae L$192_or_more4x
- cmpq $128,%rdx
- jae L$128_or_more4x
- cmpq $64,%rdx
- jae L$64_or_more4x
-
-
- xorq %r10,%r10
-
- movdqa %xmm12,16(%rsp)
- movdqa %xmm4,32(%rsp)
- movdqa %xmm0,48(%rsp)
- jmp L$oop_tail4x
-
-.p2align 5
-L$64_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu %xmm11,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm7,48(%rdi)
- je L$done4x
-
- movdqa 16(%rsp),%xmm6
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm13,16(%rsp)
- leaq 64(%rdi),%rdi
- movdqa %xmm5,32(%rsp)
- subq $64,%rdx
- movdqa %xmm1,48(%rsp)
- jmp L$oop_tail4x
-
-.p2align 5
-L$128_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu %xmm11,80(%rdi)
- movdqu %xmm2,96(%rdi)
- movdqu %xmm7,112(%rdi)
- je L$done4x
-
- movdqa 32(%rsp),%xmm6
- leaq 128(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm10,16(%rsp)
- leaq 128(%rdi),%rdi
- movdqa %xmm14,32(%rsp)
- subq $128,%rdx
- movdqa %xmm8,48(%rsp)
- jmp L$oop_tail4x
-
-.p2align 5
-L$192_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,64(%rdi)
- movdqu 0(%rsi),%xmm6
- movdqu %xmm11,80(%rdi)
- movdqu 16(%rsi),%xmm11
- movdqu %xmm2,96(%rdi)
- movdqu 32(%rsi),%xmm2
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
- movdqu 48(%rsi),%xmm7
- pxor 32(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu %xmm11,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm7,48(%rdi)
- je L$done4x
-
- movdqa 48(%rsp),%xmm6
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm15,16(%rsp)
- leaq 64(%rdi),%rdi
- movdqa %xmm9,32(%rsp)
- subq $192,%rdx
- movdqa %xmm3,48(%rsp)
-
-L$oop_tail4x:
- movzbl (%rsi,%r10,1),%eax
- movzbl (%rsp,%r10,1),%ecx
- leaq 1(%r10),%r10
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r10,1)
- decq %rdx
- jnz L$oop_tail4x
-
-L$done4x:
- leaq (%r9),%rsp
-
-L$4x_epilogue:
- ret
-
-
-
-.p2align 5
-ChaCha20_8x:
-L$ChaCha20_8x:
-
- movq %rsp,%r9
-
- subq $0x280+8,%rsp
- andq $-32,%rsp
- vzeroupper
-
-
-
-
-
-
-
-
-
-
- vbroadcasti128 L$sigma(%rip),%ymm11
- vbroadcasti128 (%rcx),%ymm3
- vbroadcasti128 16(%rcx),%ymm15
- vbroadcasti128 (%r8),%ymm7
- leaq 256(%rsp),%rcx
- leaq 512(%rsp),%rax
- leaq L$rot16(%rip),%r10
- leaq L$rot24(%rip),%r11
-
- vpshufd $0x00,%ymm11,%ymm8
- vpshufd $0x55,%ymm11,%ymm9
- vmovdqa %ymm8,128-256(%rcx)
- vpshufd $0xaa,%ymm11,%ymm10
- vmovdqa %ymm9,160-256(%rcx)
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa %ymm10,192-256(%rcx)
- vmovdqa %ymm11,224-256(%rcx)
-
- vpshufd $0x00,%ymm3,%ymm0
- vpshufd $0x55,%ymm3,%ymm1
- vmovdqa %ymm0,256-256(%rcx)
- vpshufd $0xaa,%ymm3,%ymm2
- vmovdqa %ymm1,288-256(%rcx)
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa %ymm2,320-256(%rcx)
- vmovdqa %ymm3,352-256(%rcx)
-
- vpshufd $0x00,%ymm15,%ymm12
- vpshufd $0x55,%ymm15,%ymm13
- vmovdqa %ymm12,384-512(%rax)
- vpshufd $0xaa,%ymm15,%ymm14
- vmovdqa %ymm13,416-512(%rax)
- vpshufd $0xff,%ymm15,%ymm15
- vmovdqa %ymm14,448-512(%rax)
- vmovdqa %ymm15,480-512(%rax)
-
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpaddd L$incy(%rip),%ymm4,%ymm4
- vpshufd $0xaa,%ymm7,%ymm6
- vmovdqa %ymm5,544-512(%rax)
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa %ymm6,576-512(%rax)
- vmovdqa %ymm7,608-512(%rax)
-
- jmp L$oop_enter8x
-
-.p2align 5
-L$oop_outer8x:
- vmovdqa 128-256(%rcx),%ymm8
- vmovdqa 160-256(%rcx),%ymm9
- vmovdqa 192-256(%rcx),%ymm10
- vmovdqa 224-256(%rcx),%ymm11
- vmovdqa 256-256(%rcx),%ymm0
- vmovdqa 288-256(%rcx),%ymm1
- vmovdqa 320-256(%rcx),%ymm2
- vmovdqa 352-256(%rcx),%ymm3
- vmovdqa 384-512(%rax),%ymm12
- vmovdqa 416-512(%rax),%ymm13
- vmovdqa 448-512(%rax),%ymm14
- vmovdqa 480-512(%rax),%ymm15
- vmovdqa 512-512(%rax),%ymm4
- vmovdqa 544-512(%rax),%ymm5
- vmovdqa 576-512(%rax),%ymm6
- vmovdqa 608-512(%rax),%ymm7
- vpaddd L$eight(%rip),%ymm4,%ymm4
-
-L$oop_enter8x:
- vmovdqa %ymm14,64(%rsp)
- vmovdqa %ymm15,96(%rsp)
- vbroadcasti128 (%r10),%ymm15
- vmovdqa %ymm4,512-512(%rax)
- movl $10,%eax
- jmp L$oop8x
-
-.p2align 5
-L$oop8x:
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $12,%ymm0,%ymm14
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $12,%ymm1,%ymm15
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $7,%ymm0,%ymm15
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $7,%ymm1,%ymm14
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vmovdqa %ymm12,0(%rsp)
- vmovdqa %ymm13,32(%rsp)
- vmovdqa 64(%rsp),%ymm12
- vmovdqa 96(%rsp),%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $12,%ymm2,%ymm14
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $12,%ymm3,%ymm15
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $7,%ymm2,%ymm15
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $7,%ymm3,%ymm14
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $12,%ymm1,%ymm14
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $12,%ymm2,%ymm15
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $7,%ymm1,%ymm15
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $7,%ymm2,%ymm14
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vmovdqa %ymm12,64(%rsp)
- vmovdqa %ymm13,96(%rsp)
- vmovdqa 0(%rsp),%ymm12
- vmovdqa 32(%rsp),%ymm13
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $12,%ymm3,%ymm14
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $12,%ymm0,%ymm15
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $7,%ymm3,%ymm15
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $7,%ymm0,%ymm14
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- decl %eax
- jnz L$oop8x
-
- leaq 512(%rsp),%rax
- vpaddd 128-256(%rcx),%ymm8,%ymm8
- vpaddd 160-256(%rcx),%ymm9,%ymm9
- vpaddd 192-256(%rcx),%ymm10,%ymm10
- vpaddd 224-256(%rcx),%ymm11,%ymm11
-
- vpunpckldq %ymm9,%ymm8,%ymm14
- vpunpckldq %ymm11,%ymm10,%ymm15
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm15,%ymm14,%ymm9
- vpunpckhqdq %ymm15,%ymm14,%ymm14
- vpunpcklqdq %ymm10,%ymm8,%ymm11
- vpunpckhqdq %ymm10,%ymm8,%ymm8
- vpaddd 256-256(%rcx),%ymm0,%ymm0
- vpaddd 288-256(%rcx),%ymm1,%ymm1
- vpaddd 320-256(%rcx),%ymm2,%ymm2
- vpaddd 352-256(%rcx),%ymm3,%ymm3
-
- vpunpckldq %ymm1,%ymm0,%ymm10
- vpunpckldq %ymm3,%ymm2,%ymm15
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm10,%ymm1
- vpunpckhqdq %ymm15,%ymm10,%ymm10
- vpunpcklqdq %ymm2,%ymm0,%ymm3
- vpunpckhqdq %ymm2,%ymm0,%ymm0
- vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
- vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
- vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
- vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
- vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
- vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
- vmovdqa %ymm15,0(%rsp)
- vmovdqa %ymm9,32(%rsp)
- vmovdqa 64(%rsp),%ymm15
- vmovdqa 96(%rsp),%ymm9
-
- vpaddd 384-512(%rax),%ymm12,%ymm12
- vpaddd 416-512(%rax),%ymm13,%ymm13
- vpaddd 448-512(%rax),%ymm15,%ymm15
- vpaddd 480-512(%rax),%ymm9,%ymm9
-
- vpunpckldq %ymm13,%ymm12,%ymm2
- vpunpckldq %ymm9,%ymm15,%ymm8
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm9,%ymm15,%ymm15
- vpunpcklqdq %ymm8,%ymm2,%ymm13
- vpunpckhqdq %ymm8,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm12,%ymm9
- vpunpckhqdq %ymm15,%ymm12,%ymm12
- vpaddd 512-512(%rax),%ymm4,%ymm4
- vpaddd 544-512(%rax),%ymm5,%ymm5
- vpaddd 576-512(%rax),%ymm6,%ymm6
- vpaddd 608-512(%rax),%ymm7,%ymm7
-
- vpunpckldq %ymm5,%ymm4,%ymm15
- vpunpckldq %ymm7,%ymm6,%ymm8
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm8,%ymm15,%ymm5
- vpunpckhqdq %ymm8,%ymm15,%ymm15
- vpunpcklqdq %ymm6,%ymm4,%ymm7
- vpunpckhqdq %ymm6,%ymm4,%ymm4
- vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
- vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
- vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
- vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
- vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
- vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
- vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
- vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
- vmovdqa 0(%rsp),%ymm6
- vmovdqa 32(%rsp),%ymm12
-
- cmpq $512,%rdx
- jb L$tail8x
-
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- leaq 128(%rsi),%rsi
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- leaq 128(%rdi),%rdi
-
- vpxor 0(%rsi),%ymm12,%ymm12
- vpxor 32(%rsi),%ymm13,%ymm13
- vpxor 64(%rsi),%ymm10,%ymm10
- vpxor 96(%rsi),%ymm15,%ymm15
- leaq 128(%rsi),%rsi
- vmovdqu %ymm12,0(%rdi)
- vmovdqu %ymm13,32(%rdi)
- vmovdqu %ymm10,64(%rdi)
- vmovdqu %ymm15,96(%rdi)
- leaq 128(%rdi),%rdi
-
- vpxor 0(%rsi),%ymm14,%ymm14
- vpxor 32(%rsi),%ymm2,%ymm2
- vpxor 64(%rsi),%ymm3,%ymm3
- vpxor 96(%rsi),%ymm7,%ymm7
- leaq 128(%rsi),%rsi
- vmovdqu %ymm14,0(%rdi)
- vmovdqu %ymm2,32(%rdi)
- vmovdqu %ymm3,64(%rdi)
- vmovdqu %ymm7,96(%rdi)
- leaq 128(%rdi),%rdi
-
- vpxor 0(%rsi),%ymm11,%ymm11
- vpxor 32(%rsi),%ymm9,%ymm9
- vpxor 64(%rsi),%ymm0,%ymm0
- vpxor 96(%rsi),%ymm4,%ymm4
- leaq 128(%rsi),%rsi
- vmovdqu %ymm11,0(%rdi)
- vmovdqu %ymm9,32(%rdi)
- vmovdqu %ymm0,64(%rdi)
- vmovdqu %ymm4,96(%rdi)
- leaq 128(%rdi),%rdi
-
- subq $512,%rdx
- jnz L$oop_outer8x
-
- jmp L$done8x
-
-L$tail8x:
- cmpq $448,%rdx
- jae L$448_or_more8x
- cmpq $384,%rdx
- jae L$384_or_more8x
- cmpq $320,%rdx
- jae L$320_or_more8x
- cmpq $256,%rdx
- jae L$256_or_more8x
- cmpq $192,%rdx
- jae L$192_or_more8x
- cmpq $128,%rdx
- jae L$128_or_more8x
- cmpq $64,%rdx
- jae L$64_or_more8x
-
- xorq %r10,%r10
- vmovdqa %ymm6,0(%rsp)
- vmovdqa %ymm8,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$64_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- je L$done8x
-
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm1,0(%rsp)
- leaq 64(%rdi),%rdi
- subq $64,%rdx
- vmovdqa %ymm5,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$128_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- je L$done8x
-
- leaq 128(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm12,0(%rsp)
- leaq 128(%rdi),%rdi
- subq $128,%rdx
- vmovdqa %ymm13,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$192_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- je L$done8x
-
- leaq 192(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm10,0(%rsp)
- leaq 192(%rdi),%rdi
- subq $192,%rdx
- vmovdqa %ymm15,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$256_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- je L$done8x
-
- leaq 256(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm14,0(%rsp)
- leaq 256(%rdi),%rdi
- subq $256,%rdx
- vmovdqa %ymm2,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$320_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- je L$done8x
-
- leaq 320(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm3,0(%rsp)
- leaq 320(%rdi),%rdi
- subq $320,%rdx
- vmovdqa %ymm7,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$384_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vpxor 320(%rsi),%ymm3,%ymm3
- vpxor 352(%rsi),%ymm7,%ymm7
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- vmovdqu %ymm3,320(%rdi)
- vmovdqu %ymm7,352(%rdi)
- je L$done8x
-
- leaq 384(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm11,0(%rsp)
- leaq 384(%rdi),%rdi
- subq $384,%rdx
- vmovdqa %ymm9,32(%rsp)
- jmp L$oop_tail8x
-
-.p2align 5
-L$448_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vpxor 320(%rsi),%ymm3,%ymm3
- vpxor 352(%rsi),%ymm7,%ymm7
- vpxor 384(%rsi),%ymm11,%ymm11
- vpxor 416(%rsi),%ymm9,%ymm9
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- vmovdqu %ymm3,320(%rdi)
- vmovdqu %ymm7,352(%rdi)
- vmovdqu %ymm11,384(%rdi)
- vmovdqu %ymm9,416(%rdi)
- je L$done8x
-
- leaq 448(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm0,0(%rsp)
- leaq 448(%rdi),%rdi
- subq $448,%rdx
- vmovdqa %ymm4,32(%rsp)
-
-L$oop_tail8x:
- movzbl (%rsi,%r10,1),%eax
- movzbl (%rsp,%r10,1),%ecx
- leaq 1(%r10),%r10
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r10,1)
- decq %rdx
- jnz L$oop_tail8x
-
-L$done8x:
- vzeroall
- leaq (%r9),%rsp
-
-L$8x_epilogue:
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S b/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S
deleted file mode 100644
index 188ce56..0000000
--- a/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S
+++ /dev/null
@@ -1,3079 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.section __DATA,__const
-
-.p2align 4
-one:
-.quad 1,0
-two:
-.quad 2,0
-three:
-.quad 3,0
-four:
-.quad 4,0
-five:
-.quad 5,0
-six:
-.quad 6,0
-seven:
-.quad 7,0
-eight:
-.quad 8,0
-
-OR_MASK:
-.long 0x00000000,0x00000000,0x00000000,0x80000000
-poly:
-.quad 0x1, 0xc200000000000000
-mask:
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
-con1:
-.long 1,1,1,1
-con2:
-.long 0x1b,0x1b,0x1b,0x1b
-con3:
-.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
-and_mask:
-.long 0,0xffffffff, 0xffffffff, 0xffffffff
-.text
-
-.p2align 4
-GFMUL:
-
- vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
- vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5
- vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
- vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm2,%xmm2
- vpxor %xmm3,%xmm5,%xmm5
-
- vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
- vpshufd $78,%xmm2,%xmm4
- vpxor %xmm4,%xmm3,%xmm2
-
- vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
- vpshufd $78,%xmm2,%xmm4
- vpxor %xmm4,%xmm3,%xmm2
-
- vpxor %xmm5,%xmm2,%xmm0
- ret
-
-
-.globl _aesgcmsiv_htable_init
-.private_extern _aesgcmsiv_htable_init
-
-.p2align 4
-_aesgcmsiv_htable_init:
-
-_CET_ENDBR
- vmovdqa (%rsi),%xmm0
- vmovdqa %xmm0,%xmm1
- vmovdqa %xmm0,(%rdi)
- call GFMUL
- vmovdqa %xmm0,16(%rdi)
- call GFMUL
- vmovdqa %xmm0,32(%rdi)
- call GFMUL
- vmovdqa %xmm0,48(%rdi)
- call GFMUL
- vmovdqa %xmm0,64(%rdi)
- call GFMUL
- vmovdqa %xmm0,80(%rdi)
- call GFMUL
- vmovdqa %xmm0,96(%rdi)
- call GFMUL
- vmovdqa %xmm0,112(%rdi)
- ret
-
-
-.globl _aesgcmsiv_htable6_init
-.private_extern _aesgcmsiv_htable6_init
-
-.p2align 4
-_aesgcmsiv_htable6_init:
-
-_CET_ENDBR
- vmovdqa (%rsi),%xmm0
- vmovdqa %xmm0,%xmm1
- vmovdqa %xmm0,(%rdi)
- call GFMUL
- vmovdqa %xmm0,16(%rdi)
- call GFMUL
- vmovdqa %xmm0,32(%rdi)
- call GFMUL
- vmovdqa %xmm0,48(%rdi)
- call GFMUL
- vmovdqa %xmm0,64(%rdi)
- call GFMUL
- vmovdqa %xmm0,80(%rdi)
- ret
-
-
-.globl _aesgcmsiv_htable_polyval
-.private_extern _aesgcmsiv_htable_polyval
-
-.p2align 4
-_aesgcmsiv_htable_polyval:
-
-_CET_ENDBR
- testq %rdx,%rdx
- jnz L$htable_polyval_start
- ret
-
-L$htable_polyval_start:
- vzeroall
-
-
-
- movq %rdx,%r11
- andq $127,%r11
-
- jz L$htable_polyval_no_prefix
-
- vpxor %xmm9,%xmm9,%xmm9
- vmovdqa (%rcx),%xmm1
- subq %r11,%rdx
-
- subq $16,%r11
-
-
- vmovdqu (%rsi),%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-
- vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5
- vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3
- vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4
- vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
- leaq 16(%rsi),%rsi
- testq %r11,%r11
- jnz L$htable_polyval_prefix_loop
- jmp L$htable_polyval_prefix_complete
-
-
-.p2align 6
-L$htable_polyval_prefix_loop:
- subq $16,%r11
-
- vmovdqu (%rsi),%xmm0
-
- vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
- testq %r11,%r11
-
- leaq 16(%rsi),%rsi
-
- jnz L$htable_polyval_prefix_loop
-
-L$htable_polyval_prefix_complete:
- vpsrldq $8,%xmm5,%xmm6
- vpslldq $8,%xmm5,%xmm5
-
- vpxor %xmm6,%xmm4,%xmm9
- vpxor %xmm5,%xmm3,%xmm1
-
- jmp L$htable_polyval_main_loop
-
-L$htable_polyval_no_prefix:
-
-
-
-
- vpxor %xmm1,%xmm1,%xmm1
- vmovdqa (%rcx),%xmm9
-
-.p2align 6
-L$htable_polyval_main_loop:
- subq $0x80,%rdx
- jb L$htable_polyval_out
-
- vmovdqu 112(%rsi),%xmm0
-
- vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5
- vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3
- vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4
- vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vmovdqu 96(%rsi),%xmm0
- vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
-
- vmovdqu 80(%rsi),%xmm0
-
- vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
- vpalignr $8,%xmm1,%xmm1,%xmm1
-
- vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vpxor %xmm7,%xmm1,%xmm1
-
- vmovdqu 64(%rsi),%xmm0
-
- vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vmovdqu 48(%rsi),%xmm0
-
- vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
- vpalignr $8,%xmm1,%xmm1,%xmm1
-
- vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vpxor %xmm7,%xmm1,%xmm1
-
- vmovdqu 32(%rsi),%xmm0
-
- vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vpxor %xmm9,%xmm1,%xmm1
-
- vmovdqu 16(%rsi),%xmm0
-
- vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vmovdqu 0(%rsi),%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-
- vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm3,%xmm3
- vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm4,%xmm4
- vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6
- vpxor %xmm6,%xmm5,%xmm5
-
-
- vpsrldq $8,%xmm5,%xmm6
- vpslldq $8,%xmm5,%xmm5
-
- vpxor %xmm6,%xmm4,%xmm9
- vpxor %xmm5,%xmm3,%xmm1
-
- leaq 128(%rsi),%rsi
- jmp L$htable_polyval_main_loop
-
-
-
-L$htable_polyval_out:
- vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
- vpalignr $8,%xmm1,%xmm1,%xmm1
- vpxor %xmm6,%xmm1,%xmm1
-
- vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
- vpalignr $8,%xmm1,%xmm1,%xmm1
- vpxor %xmm6,%xmm1,%xmm1
- vpxor %xmm9,%xmm1,%xmm1
-
- vmovdqu %xmm1,(%rcx)
- vzeroupper
- ret
-
-
-.globl _aesgcmsiv_polyval_horner
-.private_extern _aesgcmsiv_polyval_horner
-
-.p2align 4
-_aesgcmsiv_polyval_horner:
-
-_CET_ENDBR
- testq %rcx,%rcx
- jnz L$polyval_horner_start
- ret
-
-L$polyval_horner_start:
-
-
-
- xorq %r10,%r10
- shlq $4,%rcx
-
- vmovdqa (%rsi),%xmm1
- vmovdqa (%rdi),%xmm0
-
-L$polyval_horner_loop:
- vpxor (%rdx,%r10,1),%xmm0,%xmm0
- call GFMUL
-
- addq $16,%r10
- cmpq %r10,%rcx
- jne L$polyval_horner_loop
-
-
- vmovdqa %xmm0,(%rdi)
- ret
-
-
-.globl _aes128gcmsiv_aes_ks
-.private_extern _aes128gcmsiv_aes_ks
-
-.p2align 4
-_aes128gcmsiv_aes_ks:
-
-_CET_ENDBR
- vmovdqu (%rdi),%xmm1
- vmovdqa %xmm1,(%rsi)
-
- vmovdqa con1(%rip),%xmm0
- vmovdqa mask(%rip),%xmm15
-
- movq $8,%rax
-
-L$ks128_loop:
- addq $16,%rsi
- subq $1,%rax
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm1,(%rsi)
- jne L$ks128_loop
-
- vmovdqa con2(%rip),%xmm0
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm1,16(%rsi)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslldq $4,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpslldq $4,%xmm3,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm1,32(%rsi)
- ret
-
-
-.globl _aes256gcmsiv_aes_ks
-.private_extern _aes256gcmsiv_aes_ks
-
-.p2align 4
-_aes256gcmsiv_aes_ks:
-
-_CET_ENDBR
- vmovdqu (%rdi),%xmm1
- vmovdqu 16(%rdi),%xmm3
- vmovdqa %xmm1,(%rsi)
- vmovdqa %xmm3,16(%rsi)
- vmovdqa con1(%rip),%xmm0
- vmovdqa mask(%rip),%xmm15
- vpxor %xmm14,%xmm14,%xmm14
- movq $6,%rax
-
-L$ks256_loop:
- addq $32,%rsi
- subq $1,%rax
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm1,(%rsi)
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpsllq $32,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpshufb con3(%rip),%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vmovdqa %xmm3,16(%rsi)
- jne L$ks256_loop
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpsllq $32,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm1,32(%rsi)
- ret
-
-.globl _aes128gcmsiv_aes_ks_enc_x1
-.private_extern _aes128gcmsiv_aes_ks_enc_x1
-
-.p2align 4
-_aes128gcmsiv_aes_ks_enc_x1:
-
-_CET_ENDBR
- vmovdqa (%rcx),%xmm1
- vmovdqa 0(%rdi),%xmm4
-
- vmovdqa %xmm1,(%rdx)
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqa con1(%rip),%xmm0
- vmovdqa mask(%rip),%xmm15
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,16(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,32(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,48(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,64(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,80(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,96(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,112(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,128(%rdx)
-
-
- vmovdqa con2(%rip),%xmm0
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenc %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,144(%rdx)
-
- vpshufb %xmm15,%xmm1,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpsllq $32,%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpshufb con3(%rip),%xmm1,%xmm3
- vpxor %xmm3,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
-
- vaesenclast %xmm1,%xmm4,%xmm4
- vmovdqa %xmm1,160(%rdx)
-
-
- vmovdqa %xmm4,0(%rsi)
- ret
-
-
-.globl _aes128gcmsiv_kdf
-.private_extern _aes128gcmsiv_kdf
-
-.p2align 4
-_aes128gcmsiv_kdf:
-
-_CET_ENDBR
-
-
-
-
- vmovdqa (%rdx),%xmm1
- vmovdqa 0(%rdi),%xmm9
- vmovdqa and_mask(%rip),%xmm12
- vmovdqa one(%rip),%xmm13
- vpshufd $0x90,%xmm9,%xmm9
- vpand %xmm12,%xmm9,%xmm9
- vpaddd %xmm13,%xmm9,%xmm10
- vpaddd %xmm13,%xmm10,%xmm11
- vpaddd %xmm13,%xmm11,%xmm12
-
- vpxor %xmm1,%xmm9,%xmm9
- vpxor %xmm1,%xmm10,%xmm10
- vpxor %xmm1,%xmm11,%xmm11
- vpxor %xmm1,%xmm12,%xmm12
-
- vmovdqa 16(%rdx),%xmm1
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
-
- vmovdqa 32(%rdx),%xmm2
- vaesenc %xmm2,%xmm9,%xmm9
- vaesenc %xmm2,%xmm10,%xmm10
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
-
- vmovdqa 48(%rdx),%xmm1
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
-
- vmovdqa 64(%rdx),%xmm2
- vaesenc %xmm2,%xmm9,%xmm9
- vaesenc %xmm2,%xmm10,%xmm10
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
-
- vmovdqa 80(%rdx),%xmm1
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
-
- vmovdqa 96(%rdx),%xmm2
- vaesenc %xmm2,%xmm9,%xmm9
- vaesenc %xmm2,%xmm10,%xmm10
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
-
- vmovdqa 112(%rdx),%xmm1
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
-
- vmovdqa 128(%rdx),%xmm2
- vaesenc %xmm2,%xmm9,%xmm9
- vaesenc %xmm2,%xmm10,%xmm10
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
-
- vmovdqa 144(%rdx),%xmm1
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
-
- vmovdqa 160(%rdx),%xmm2
- vaesenclast %xmm2,%xmm9,%xmm9
- vaesenclast %xmm2,%xmm10,%xmm10
- vaesenclast %xmm2,%xmm11,%xmm11
- vaesenclast %xmm2,%xmm12,%xmm12
-
-
- vmovdqa %xmm9,0(%rsi)
- vmovdqa %xmm10,16(%rsi)
- vmovdqa %xmm11,32(%rsi)
- vmovdqa %xmm12,48(%rsi)
- ret
-
-
-.globl _aes128gcmsiv_enc_msg_x4
-.private_extern _aes128gcmsiv_enc_msg_x4
-
-.p2align 4
-_aes128gcmsiv_enc_msg_x4:
-
-_CET_ENDBR
- testq %r8,%r8
- jnz L$128_enc_msg_x4_start
- ret
-
-L$128_enc_msg_x4_start:
- pushq %r12
-
- pushq %r13
-
-
- shrq $4,%r8
- movq %r8,%r10
- shlq $62,%r10
- shrq $62,%r10
-
-
- vmovdqa (%rdx),%xmm15
- vpor OR_MASK(%rip),%xmm15,%xmm15
-
- vmovdqu four(%rip),%xmm4
- vmovdqa %xmm15,%xmm0
- vpaddd one(%rip),%xmm15,%xmm1
- vpaddd two(%rip),%xmm15,%xmm2
- vpaddd three(%rip),%xmm15,%xmm3
-
- shrq $2,%r8
- je L$128_enc_msg_x4_check_remainder
-
- subq $64,%rsi
- subq $64,%rdi
-
-L$128_enc_msg_x4_loop1:
- addq $64,%rsi
- addq $64,%rdi
-
- vmovdqa %xmm0,%xmm5
- vmovdqa %xmm1,%xmm6
- vmovdqa %xmm2,%xmm7
- vmovdqa %xmm3,%xmm8
-
- vpxor (%rcx),%xmm5,%xmm5
- vpxor (%rcx),%xmm6,%xmm6
- vpxor (%rcx),%xmm7,%xmm7
- vpxor (%rcx),%xmm8,%xmm8
-
- vmovdqu 16(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqu 32(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm1,%xmm1
- vmovdqu 48(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm2,%xmm2
- vmovdqu 64(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm3,%xmm3
-
- vmovdqu 80(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 96(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 112(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 128(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 144(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 160(%rcx),%xmm12
- vaesenclast %xmm12,%xmm5,%xmm5
- vaesenclast %xmm12,%xmm6,%xmm6
- vaesenclast %xmm12,%xmm7,%xmm7
- vaesenclast %xmm12,%xmm8,%xmm8
-
-
-
- vpxor 0(%rdi),%xmm5,%xmm5
- vpxor 16(%rdi),%xmm6,%xmm6
- vpxor 32(%rdi),%xmm7,%xmm7
- vpxor 48(%rdi),%xmm8,%xmm8
-
- subq $1,%r8
-
- vmovdqu %xmm5,0(%rsi)
- vmovdqu %xmm6,16(%rsi)
- vmovdqu %xmm7,32(%rsi)
- vmovdqu %xmm8,48(%rsi)
-
- jne L$128_enc_msg_x4_loop1
-
- addq $64,%rsi
- addq $64,%rdi
-
-L$128_enc_msg_x4_check_remainder:
- cmpq $0,%r10
- je L$128_enc_msg_x4_out
-
-L$128_enc_msg_x4_loop2:
-
-
- vmovdqa %xmm0,%xmm5
- vpaddd one(%rip),%xmm0,%xmm0
-
- vpxor (%rcx),%xmm5,%xmm5
- vaesenc 16(%rcx),%xmm5,%xmm5
- vaesenc 32(%rcx),%xmm5,%xmm5
- vaesenc 48(%rcx),%xmm5,%xmm5
- vaesenc 64(%rcx),%xmm5,%xmm5
- vaesenc 80(%rcx),%xmm5,%xmm5
- vaesenc 96(%rcx),%xmm5,%xmm5
- vaesenc 112(%rcx),%xmm5,%xmm5
- vaesenc 128(%rcx),%xmm5,%xmm5
- vaesenc 144(%rcx),%xmm5,%xmm5
- vaesenclast 160(%rcx),%xmm5,%xmm5
-
-
- vpxor (%rdi),%xmm5,%xmm5
- vmovdqu %xmm5,(%rsi)
-
- addq $16,%rdi
- addq $16,%rsi
-
- subq $1,%r10
- jne L$128_enc_msg_x4_loop2
-
-L$128_enc_msg_x4_out:
- popq %r13
-
- popq %r12
-
- ret
-
-
-.globl _aes128gcmsiv_enc_msg_x8
-.private_extern _aes128gcmsiv_enc_msg_x8
-
-.p2align 4
-_aes128gcmsiv_enc_msg_x8:
-
-_CET_ENDBR
- testq %r8,%r8
- jnz L$128_enc_msg_x8_start
- ret
-
-L$128_enc_msg_x8_start:
- pushq %r12
-
- pushq %r13
-
- pushq %rbp
-
- movq %rsp,%rbp
-
-
-
- subq $128,%rsp
- andq $-64,%rsp
-
- shrq $4,%r8
- movq %r8,%r10
- shlq $61,%r10
- shrq $61,%r10
-
-
- vmovdqu (%rdx),%xmm1
- vpor OR_MASK(%rip),%xmm1,%xmm1
-
-
- vpaddd seven(%rip),%xmm1,%xmm0
- vmovdqu %xmm0,(%rsp)
- vpaddd one(%rip),%xmm1,%xmm9
- vpaddd two(%rip),%xmm1,%xmm10
- vpaddd three(%rip),%xmm1,%xmm11
- vpaddd four(%rip),%xmm1,%xmm12
- vpaddd five(%rip),%xmm1,%xmm13
- vpaddd six(%rip),%xmm1,%xmm14
- vmovdqa %xmm1,%xmm0
-
- shrq $3,%r8
- je L$128_enc_msg_x8_check_remainder
-
- subq $128,%rsi
- subq $128,%rdi
-
-L$128_enc_msg_x8_loop1:
- addq $128,%rsi
- addq $128,%rdi
-
- vmovdqa %xmm0,%xmm1
- vmovdqa %xmm9,%xmm2
- vmovdqa %xmm10,%xmm3
- vmovdqa %xmm11,%xmm4
- vmovdqa %xmm12,%xmm5
- vmovdqa %xmm13,%xmm6
- vmovdqa %xmm14,%xmm7
-
- vmovdqu (%rsp),%xmm8
-
- vpxor (%rcx),%xmm1,%xmm1
- vpxor (%rcx),%xmm2,%xmm2
- vpxor (%rcx),%xmm3,%xmm3
- vpxor (%rcx),%xmm4,%xmm4
- vpxor (%rcx),%xmm5,%xmm5
- vpxor (%rcx),%xmm6,%xmm6
- vpxor (%rcx),%xmm7,%xmm7
- vpxor (%rcx),%xmm8,%xmm8
-
- vmovdqu 16(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu (%rsp),%xmm14
- vpaddd eight(%rip),%xmm14,%xmm14
- vmovdqu %xmm14,(%rsp)
- vmovdqu 32(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpsubd one(%rip),%xmm14,%xmm14
- vmovdqu 48(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm0,%xmm0
- vmovdqu 64(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm9,%xmm9
- vmovdqu 80(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm10,%xmm10
- vmovdqu 96(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm11,%xmm11
- vmovdqu 112(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm12,%xmm12
- vmovdqu 128(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm13,%xmm13
- vmovdqu 144(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 160(%rcx),%xmm15
- vaesenclast %xmm15,%xmm1,%xmm1
- vaesenclast %xmm15,%xmm2,%xmm2
- vaesenclast %xmm15,%xmm3,%xmm3
- vaesenclast %xmm15,%xmm4,%xmm4
- vaesenclast %xmm15,%xmm5,%xmm5
- vaesenclast %xmm15,%xmm6,%xmm6
- vaesenclast %xmm15,%xmm7,%xmm7
- vaesenclast %xmm15,%xmm8,%xmm8
-
-
-
- vpxor 0(%rdi),%xmm1,%xmm1
- vpxor 16(%rdi),%xmm2,%xmm2
- vpxor 32(%rdi),%xmm3,%xmm3
- vpxor 48(%rdi),%xmm4,%xmm4
- vpxor 64(%rdi),%xmm5,%xmm5
- vpxor 80(%rdi),%xmm6,%xmm6
- vpxor 96(%rdi),%xmm7,%xmm7
- vpxor 112(%rdi),%xmm8,%xmm8
-
- decq %r8
-
- vmovdqu %xmm1,0(%rsi)
- vmovdqu %xmm2,16(%rsi)
- vmovdqu %xmm3,32(%rsi)
- vmovdqu %xmm4,48(%rsi)
- vmovdqu %xmm5,64(%rsi)
- vmovdqu %xmm6,80(%rsi)
- vmovdqu %xmm7,96(%rsi)
- vmovdqu %xmm8,112(%rsi)
-
- jne L$128_enc_msg_x8_loop1
-
- addq $128,%rsi
- addq $128,%rdi
-
-L$128_enc_msg_x8_check_remainder:
- cmpq $0,%r10
- je L$128_enc_msg_x8_out
-
-L$128_enc_msg_x8_loop2:
-
-
- vmovdqa %xmm0,%xmm1
- vpaddd one(%rip),%xmm0,%xmm0
-
- vpxor (%rcx),%xmm1,%xmm1
- vaesenc 16(%rcx),%xmm1,%xmm1
- vaesenc 32(%rcx),%xmm1,%xmm1
- vaesenc 48(%rcx),%xmm1,%xmm1
- vaesenc 64(%rcx),%xmm1,%xmm1
- vaesenc 80(%rcx),%xmm1,%xmm1
- vaesenc 96(%rcx),%xmm1,%xmm1
- vaesenc 112(%rcx),%xmm1,%xmm1
- vaesenc 128(%rcx),%xmm1,%xmm1
- vaesenc 144(%rcx),%xmm1,%xmm1
- vaesenclast 160(%rcx),%xmm1,%xmm1
-
-
- vpxor (%rdi),%xmm1,%xmm1
-
- vmovdqu %xmm1,(%rsi)
-
- addq $16,%rdi
- addq $16,%rsi
-
- decq %r10
- jne L$128_enc_msg_x8_loop2
-
-L$128_enc_msg_x8_out:
- movq %rbp,%rsp
-
- popq %rbp
-
- popq %r13
-
- popq %r12
-
- ret
-
-
-.globl _aes128gcmsiv_dec
-.private_extern _aes128gcmsiv_dec
-
-.p2align 4
-_aes128gcmsiv_dec:
-
-_CET_ENDBR
- testq $~15,%r9
- jnz L$128_dec_start
- ret
-
-L$128_dec_start:
- vzeroupper
- vmovdqa (%rdx),%xmm0
- movq %rdx,%rax
-
- leaq 32(%rax),%rax
- leaq 32(%rcx),%rcx
-
-
- vmovdqu (%rdi,%r9,1),%xmm15
- vpor OR_MASK(%rip),%xmm15,%xmm15
- andq $~15,%r9
-
-
- cmpq $96,%r9
- jb L$128_dec_loop2
-
-
- subq $96,%r9
- vmovdqa %xmm15,%xmm7
- vpaddd one(%rip),%xmm7,%xmm8
- vpaddd two(%rip),%xmm7,%xmm9
- vpaddd one(%rip),%xmm9,%xmm10
- vpaddd two(%rip),%xmm9,%xmm11
- vpaddd one(%rip),%xmm11,%xmm12
- vpaddd two(%rip),%xmm11,%xmm15
-
- vpxor (%r8),%xmm7,%xmm7
- vpxor (%r8),%xmm8,%xmm8
- vpxor (%r8),%xmm9,%xmm9
- vpxor (%r8),%xmm10,%xmm10
- vpxor (%r8),%xmm11,%xmm11
- vpxor (%r8),%xmm12,%xmm12
-
- vmovdqu 16(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 32(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 48(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 64(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 80(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 96(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 112(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 128(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 144(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 160(%r8),%xmm4
- vaesenclast %xmm4,%xmm7,%xmm7
- vaesenclast %xmm4,%xmm8,%xmm8
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm4,%xmm10,%xmm10
- vaesenclast %xmm4,%xmm11,%xmm11
- vaesenclast %xmm4,%xmm12,%xmm12
-
-
- vpxor 0(%rdi),%xmm7,%xmm7
- vpxor 16(%rdi),%xmm8,%xmm8
- vpxor 32(%rdi),%xmm9,%xmm9
- vpxor 48(%rdi),%xmm10,%xmm10
- vpxor 64(%rdi),%xmm11,%xmm11
- vpxor 80(%rdi),%xmm12,%xmm12
-
- vmovdqu %xmm7,0(%rsi)
- vmovdqu %xmm8,16(%rsi)
- vmovdqu %xmm9,32(%rsi)
- vmovdqu %xmm10,48(%rsi)
- vmovdqu %xmm11,64(%rsi)
- vmovdqu %xmm12,80(%rsi)
-
- addq $96,%rdi
- addq $96,%rsi
- jmp L$128_dec_loop1
-
-
-.p2align 6
-L$128_dec_loop1:
- cmpq $96,%r9
- jb L$128_dec_finish_96
- subq $96,%r9
-
- vmovdqa %xmm12,%xmm6
- vmovdqa %xmm11,16-32(%rax)
- vmovdqa %xmm10,32-32(%rax)
- vmovdqa %xmm9,48-32(%rax)
- vmovdqa %xmm8,64-32(%rax)
- vmovdqa %xmm7,80-32(%rax)
-
- vmovdqa %xmm15,%xmm7
- vpaddd one(%rip),%xmm7,%xmm8
- vpaddd two(%rip),%xmm7,%xmm9
- vpaddd one(%rip),%xmm9,%xmm10
- vpaddd two(%rip),%xmm9,%xmm11
- vpaddd one(%rip),%xmm11,%xmm12
- vpaddd two(%rip),%xmm11,%xmm15
-
- vmovdqa (%r8),%xmm4
- vpxor %xmm4,%xmm7,%xmm7
- vpxor %xmm4,%xmm8,%xmm8
- vpxor %xmm4,%xmm9,%xmm9
- vpxor %xmm4,%xmm10,%xmm10
- vpxor %xmm4,%xmm11,%xmm11
- vpxor %xmm4,%xmm12,%xmm12
-
- vmovdqu 0-32(%rcx),%xmm4
- vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
- vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
- vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 16(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu -16(%rax),%xmm6
- vmovdqu -16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 32(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 0(%rax),%xmm6
- vmovdqu 0(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 48(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 16(%rax),%xmm6
- vmovdqu 16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 64(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 32(%rax),%xmm6
- vmovdqu 32(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 80(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 96(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 112(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
-
- vmovdqa 80-32(%rax),%xmm6
- vpxor %xmm0,%xmm6,%xmm6
- vmovdqu 80-32(%rcx),%xmm5
-
- vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 128(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
-
- vpsrldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm2,%xmm5
- vpslldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm0
-
- vmovdqa poly(%rip),%xmm3
-
- vmovdqu 144(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 160(%r8),%xmm6
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpxor 0(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm7,%xmm7
- vpxor 16(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm8,%xmm8
- vpxor 32(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm9,%xmm9
- vpxor 48(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm10,%xmm10
- vpxor 64(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm11,%xmm11
- vpxor 80(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm12,%xmm12
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vmovdqu %xmm7,0(%rsi)
- vmovdqu %xmm8,16(%rsi)
- vmovdqu %xmm9,32(%rsi)
- vmovdqu %xmm10,48(%rsi)
- vmovdqu %xmm11,64(%rsi)
- vmovdqu %xmm12,80(%rsi)
-
- vpxor %xmm5,%xmm0,%xmm0
-
- leaq 96(%rdi),%rdi
- leaq 96(%rsi),%rsi
- jmp L$128_dec_loop1
-
-L$128_dec_finish_96:
- vmovdqa %xmm12,%xmm6
- vmovdqa %xmm11,16-32(%rax)
- vmovdqa %xmm10,32-32(%rax)
- vmovdqa %xmm9,48-32(%rax)
- vmovdqa %xmm8,64-32(%rax)
- vmovdqa %xmm7,80-32(%rax)
-
- vmovdqu 0-32(%rcx),%xmm4
- vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
- vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
- vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu -16(%rax),%xmm6
- vmovdqu -16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 0(%rax),%xmm6
- vmovdqu 0(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 16(%rax),%xmm6
- vmovdqu 16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 32(%rax),%xmm6
- vmovdqu 32(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 80-32(%rax),%xmm6
- vpxor %xmm0,%xmm6,%xmm6
- vmovdqu 80-32(%rcx),%xmm5
- vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm2,%xmm5
- vpslldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm0
-
- vmovdqa poly(%rip),%xmm3
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpxor %xmm5,%xmm0,%xmm0
-
-L$128_dec_loop2:
-
-
-
- cmpq $16,%r9
- jb L$128_dec_out
- subq $16,%r9
-
- vmovdqa %xmm15,%xmm2
- vpaddd one(%rip),%xmm15,%xmm15
-
- vpxor 0(%r8),%xmm2,%xmm2
- vaesenc 16(%r8),%xmm2,%xmm2
- vaesenc 32(%r8),%xmm2,%xmm2
- vaesenc 48(%r8),%xmm2,%xmm2
- vaesenc 64(%r8),%xmm2,%xmm2
- vaesenc 80(%r8),%xmm2,%xmm2
- vaesenc 96(%r8),%xmm2,%xmm2
- vaesenc 112(%r8),%xmm2,%xmm2
- vaesenc 128(%r8),%xmm2,%xmm2
- vaesenc 144(%r8),%xmm2,%xmm2
- vaesenclast 160(%r8),%xmm2,%xmm2
- vpxor (%rdi),%xmm2,%xmm2
- vmovdqu %xmm2,(%rsi)
- addq $16,%rdi
- addq $16,%rsi
-
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa -32(%rcx),%xmm1
- call GFMUL
-
- jmp L$128_dec_loop2
-
-L$128_dec_out:
- vmovdqu %xmm0,(%rdx)
- ret
-
-
-.globl _aes128gcmsiv_ecb_enc_block
-.private_extern _aes128gcmsiv_ecb_enc_block
-
-.p2align 4
-_aes128gcmsiv_ecb_enc_block:
-
-_CET_ENDBR
- vmovdqa (%rdi),%xmm1
-
- vpxor (%rdx),%xmm1,%xmm1
- vaesenc 16(%rdx),%xmm1,%xmm1
- vaesenc 32(%rdx),%xmm1,%xmm1
- vaesenc 48(%rdx),%xmm1,%xmm1
- vaesenc 64(%rdx),%xmm1,%xmm1
- vaesenc 80(%rdx),%xmm1,%xmm1
- vaesenc 96(%rdx),%xmm1,%xmm1
- vaesenc 112(%rdx),%xmm1,%xmm1
- vaesenc 128(%rdx),%xmm1,%xmm1
- vaesenc 144(%rdx),%xmm1,%xmm1
- vaesenclast 160(%rdx),%xmm1,%xmm1
-
- vmovdqa %xmm1,(%rsi)
-
- ret
-
-
-.globl _aes256gcmsiv_aes_ks_enc_x1
-.private_extern _aes256gcmsiv_aes_ks_enc_x1
-
-.p2align 4
-_aes256gcmsiv_aes_ks_enc_x1:
-
-_CET_ENDBR
- vmovdqa con1(%rip),%xmm0
- vmovdqa mask(%rip),%xmm15
- vmovdqa (%rdi),%xmm8
- vmovdqa (%rcx),%xmm1
- vmovdqa 16(%rcx),%xmm3
- vpxor %xmm1,%xmm8,%xmm8
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm1,(%rdx)
- vmovdqu %xmm3,16(%rdx)
- vpxor %xmm14,%xmm14,%xmm14
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,32(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,48(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,64(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,80(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,96(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,112(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,128(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,144(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,160(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,176(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslld $1,%xmm0,%xmm0
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenc %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,192(%rdx)
-
- vpshufd $0xff,%xmm1,%xmm2
- vaesenclast %xmm14,%xmm2,%xmm2
- vpslldq $4,%xmm3,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpxor %xmm2,%xmm3,%xmm3
- vaesenc %xmm3,%xmm8,%xmm8
- vmovdqu %xmm3,208(%rdx)
-
- vpshufb %xmm15,%xmm3,%xmm2
- vaesenclast %xmm0,%xmm2,%xmm2
- vpslldq $4,%xmm1,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpslldq $4,%xmm4,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpxor %xmm2,%xmm1,%xmm1
- vaesenclast %xmm1,%xmm8,%xmm8
- vmovdqu %xmm1,224(%rdx)
-
- vmovdqa %xmm8,(%rsi)
- ret
-
-
-.globl _aes256gcmsiv_ecb_enc_block
-.private_extern _aes256gcmsiv_ecb_enc_block
-
-.p2align 4
-_aes256gcmsiv_ecb_enc_block:
-
-_CET_ENDBR
- vmovdqa (%rdi),%xmm1
- vpxor (%rdx),%xmm1,%xmm1
- vaesenc 16(%rdx),%xmm1,%xmm1
- vaesenc 32(%rdx),%xmm1,%xmm1
- vaesenc 48(%rdx),%xmm1,%xmm1
- vaesenc 64(%rdx),%xmm1,%xmm1
- vaesenc 80(%rdx),%xmm1,%xmm1
- vaesenc 96(%rdx),%xmm1,%xmm1
- vaesenc 112(%rdx),%xmm1,%xmm1
- vaesenc 128(%rdx),%xmm1,%xmm1
- vaesenc 144(%rdx),%xmm1,%xmm1
- vaesenc 160(%rdx),%xmm1,%xmm1
- vaesenc 176(%rdx),%xmm1,%xmm1
- vaesenc 192(%rdx),%xmm1,%xmm1
- vaesenc 208(%rdx),%xmm1,%xmm1
- vaesenclast 224(%rdx),%xmm1,%xmm1
- vmovdqa %xmm1,(%rsi)
- ret
-
-
-.globl _aes256gcmsiv_enc_msg_x4
-.private_extern _aes256gcmsiv_enc_msg_x4
-
-.p2align 4
-_aes256gcmsiv_enc_msg_x4:
-
-_CET_ENDBR
- testq %r8,%r8
- jnz L$256_enc_msg_x4_start
- ret
-
-L$256_enc_msg_x4_start:
- movq %r8,%r10
- shrq $4,%r8
- shlq $60,%r10
- jz L$256_enc_msg_x4_start2
- addq $1,%r8
-
-L$256_enc_msg_x4_start2:
- movq %r8,%r10
- shlq $62,%r10
- shrq $62,%r10
-
-
- vmovdqa (%rdx),%xmm15
- vpor OR_MASK(%rip),%xmm15,%xmm15
-
- vmovdqa four(%rip),%xmm4
- vmovdqa %xmm15,%xmm0
- vpaddd one(%rip),%xmm15,%xmm1
- vpaddd two(%rip),%xmm15,%xmm2
- vpaddd three(%rip),%xmm15,%xmm3
-
- shrq $2,%r8
- je L$256_enc_msg_x4_check_remainder
-
- subq $64,%rsi
- subq $64,%rdi
-
-L$256_enc_msg_x4_loop1:
- addq $64,%rsi
- addq $64,%rdi
-
- vmovdqa %xmm0,%xmm5
- vmovdqa %xmm1,%xmm6
- vmovdqa %xmm2,%xmm7
- vmovdqa %xmm3,%xmm8
-
- vpxor (%rcx),%xmm5,%xmm5
- vpxor (%rcx),%xmm6,%xmm6
- vpxor (%rcx),%xmm7,%xmm7
- vpxor (%rcx),%xmm8,%xmm8
-
- vmovdqu 16(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqu 32(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm1,%xmm1
- vmovdqu 48(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm2,%xmm2
- vmovdqu 64(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vpaddd %xmm4,%xmm3,%xmm3
-
- vmovdqu 80(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 96(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 112(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 128(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 144(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 160(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 176(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 192(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 208(%rcx),%xmm12
- vaesenc %xmm12,%xmm5,%xmm5
- vaesenc %xmm12,%xmm6,%xmm6
- vaesenc %xmm12,%xmm7,%xmm7
- vaesenc %xmm12,%xmm8,%xmm8
-
- vmovdqu 224(%rcx),%xmm12
- vaesenclast %xmm12,%xmm5,%xmm5
- vaesenclast %xmm12,%xmm6,%xmm6
- vaesenclast %xmm12,%xmm7,%xmm7
- vaesenclast %xmm12,%xmm8,%xmm8
-
-
-
- vpxor 0(%rdi),%xmm5,%xmm5
- vpxor 16(%rdi),%xmm6,%xmm6
- vpxor 32(%rdi),%xmm7,%xmm7
- vpxor 48(%rdi),%xmm8,%xmm8
-
- subq $1,%r8
-
- vmovdqu %xmm5,0(%rsi)
- vmovdqu %xmm6,16(%rsi)
- vmovdqu %xmm7,32(%rsi)
- vmovdqu %xmm8,48(%rsi)
-
- jne L$256_enc_msg_x4_loop1
-
- addq $64,%rsi
- addq $64,%rdi
-
-L$256_enc_msg_x4_check_remainder:
- cmpq $0,%r10
- je L$256_enc_msg_x4_out
-
-L$256_enc_msg_x4_loop2:
-
-
-
- vmovdqa %xmm0,%xmm5
- vpaddd one(%rip),%xmm0,%xmm0
- vpxor (%rcx),%xmm5,%xmm5
- vaesenc 16(%rcx),%xmm5,%xmm5
- vaesenc 32(%rcx),%xmm5,%xmm5
- vaesenc 48(%rcx),%xmm5,%xmm5
- vaesenc 64(%rcx),%xmm5,%xmm5
- vaesenc 80(%rcx),%xmm5,%xmm5
- vaesenc 96(%rcx),%xmm5,%xmm5
- vaesenc 112(%rcx),%xmm5,%xmm5
- vaesenc 128(%rcx),%xmm5,%xmm5
- vaesenc 144(%rcx),%xmm5,%xmm5
- vaesenc 160(%rcx),%xmm5,%xmm5
- vaesenc 176(%rcx),%xmm5,%xmm5
- vaesenc 192(%rcx),%xmm5,%xmm5
- vaesenc 208(%rcx),%xmm5,%xmm5
- vaesenclast 224(%rcx),%xmm5,%xmm5
-
-
- vpxor (%rdi),%xmm5,%xmm5
-
- vmovdqu %xmm5,(%rsi)
-
- addq $16,%rdi
- addq $16,%rsi
-
- subq $1,%r10
- jne L$256_enc_msg_x4_loop2
-
-L$256_enc_msg_x4_out:
- ret
-
-
-.globl _aes256gcmsiv_enc_msg_x8
-.private_extern _aes256gcmsiv_enc_msg_x8
-
-.p2align 4
-_aes256gcmsiv_enc_msg_x8:
-
-_CET_ENDBR
- testq %r8,%r8
- jnz L$256_enc_msg_x8_start
- ret
-
-L$256_enc_msg_x8_start:
-
- movq %rsp,%r11
- subq $16,%r11
- andq $-64,%r11
-
- movq %r8,%r10
- shrq $4,%r8
- shlq $60,%r10
- jz L$256_enc_msg_x8_start2
- addq $1,%r8
-
-L$256_enc_msg_x8_start2:
- movq %r8,%r10
- shlq $61,%r10
- shrq $61,%r10
-
-
- vmovdqa (%rdx),%xmm1
- vpor OR_MASK(%rip),%xmm1,%xmm1
-
-
- vpaddd seven(%rip),%xmm1,%xmm0
- vmovdqa %xmm0,(%r11)
- vpaddd one(%rip),%xmm1,%xmm9
- vpaddd two(%rip),%xmm1,%xmm10
- vpaddd three(%rip),%xmm1,%xmm11
- vpaddd four(%rip),%xmm1,%xmm12
- vpaddd five(%rip),%xmm1,%xmm13
- vpaddd six(%rip),%xmm1,%xmm14
- vmovdqa %xmm1,%xmm0
-
- shrq $3,%r8
- jz L$256_enc_msg_x8_check_remainder
-
- subq $128,%rsi
- subq $128,%rdi
-
-L$256_enc_msg_x8_loop1:
- addq $128,%rsi
- addq $128,%rdi
-
- vmovdqa %xmm0,%xmm1
- vmovdqa %xmm9,%xmm2
- vmovdqa %xmm10,%xmm3
- vmovdqa %xmm11,%xmm4
- vmovdqa %xmm12,%xmm5
- vmovdqa %xmm13,%xmm6
- vmovdqa %xmm14,%xmm7
-
- vmovdqa (%r11),%xmm8
-
- vpxor (%rcx),%xmm1,%xmm1
- vpxor (%rcx),%xmm2,%xmm2
- vpxor (%rcx),%xmm3,%xmm3
- vpxor (%rcx),%xmm4,%xmm4
- vpxor (%rcx),%xmm5,%xmm5
- vpxor (%rcx),%xmm6,%xmm6
- vpxor (%rcx),%xmm7,%xmm7
- vpxor (%rcx),%xmm8,%xmm8
-
- vmovdqu 16(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqa (%r11),%xmm14
- vpaddd eight(%rip),%xmm14,%xmm14
- vmovdqa %xmm14,(%r11)
- vmovdqu 32(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpsubd one(%rip),%xmm14,%xmm14
- vmovdqu 48(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm0,%xmm0
- vmovdqu 64(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm9,%xmm9
- vmovdqu 80(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm10,%xmm10
- vmovdqu 96(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm11,%xmm11
- vmovdqu 112(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm12,%xmm12
- vmovdqu 128(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vpaddd eight(%rip),%xmm13,%xmm13
- vmovdqu 144(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 160(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 176(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 192(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 208(%rcx),%xmm15
- vaesenc %xmm15,%xmm1,%xmm1
- vaesenc %xmm15,%xmm2,%xmm2
- vaesenc %xmm15,%xmm3,%xmm3
- vaesenc %xmm15,%xmm4,%xmm4
- vaesenc %xmm15,%xmm5,%xmm5
- vaesenc %xmm15,%xmm6,%xmm6
- vaesenc %xmm15,%xmm7,%xmm7
- vaesenc %xmm15,%xmm8,%xmm8
-
- vmovdqu 224(%rcx),%xmm15
- vaesenclast %xmm15,%xmm1,%xmm1
- vaesenclast %xmm15,%xmm2,%xmm2
- vaesenclast %xmm15,%xmm3,%xmm3
- vaesenclast %xmm15,%xmm4,%xmm4
- vaesenclast %xmm15,%xmm5,%xmm5
- vaesenclast %xmm15,%xmm6,%xmm6
- vaesenclast %xmm15,%xmm7,%xmm7
- vaesenclast %xmm15,%xmm8,%xmm8
-
-
-
- vpxor 0(%rdi),%xmm1,%xmm1
- vpxor 16(%rdi),%xmm2,%xmm2
- vpxor 32(%rdi),%xmm3,%xmm3
- vpxor 48(%rdi),%xmm4,%xmm4
- vpxor 64(%rdi),%xmm5,%xmm5
- vpxor 80(%rdi),%xmm6,%xmm6
- vpxor 96(%rdi),%xmm7,%xmm7
- vpxor 112(%rdi),%xmm8,%xmm8
-
- subq $1,%r8
-
- vmovdqu %xmm1,0(%rsi)
- vmovdqu %xmm2,16(%rsi)
- vmovdqu %xmm3,32(%rsi)
- vmovdqu %xmm4,48(%rsi)
- vmovdqu %xmm5,64(%rsi)
- vmovdqu %xmm6,80(%rsi)
- vmovdqu %xmm7,96(%rsi)
- vmovdqu %xmm8,112(%rsi)
-
- jne L$256_enc_msg_x8_loop1
-
- addq $128,%rsi
- addq $128,%rdi
-
-L$256_enc_msg_x8_check_remainder:
- cmpq $0,%r10
- je L$256_enc_msg_x8_out
-
-L$256_enc_msg_x8_loop2:
-
-
- vmovdqa %xmm0,%xmm1
- vpaddd one(%rip),%xmm0,%xmm0
-
- vpxor (%rcx),%xmm1,%xmm1
- vaesenc 16(%rcx),%xmm1,%xmm1
- vaesenc 32(%rcx),%xmm1,%xmm1
- vaesenc 48(%rcx),%xmm1,%xmm1
- vaesenc 64(%rcx),%xmm1,%xmm1
- vaesenc 80(%rcx),%xmm1,%xmm1
- vaesenc 96(%rcx),%xmm1,%xmm1
- vaesenc 112(%rcx),%xmm1,%xmm1
- vaesenc 128(%rcx),%xmm1,%xmm1
- vaesenc 144(%rcx),%xmm1,%xmm1
- vaesenc 160(%rcx),%xmm1,%xmm1
- vaesenc 176(%rcx),%xmm1,%xmm1
- vaesenc 192(%rcx),%xmm1,%xmm1
- vaesenc 208(%rcx),%xmm1,%xmm1
- vaesenclast 224(%rcx),%xmm1,%xmm1
-
-
- vpxor (%rdi),%xmm1,%xmm1
-
- vmovdqu %xmm1,(%rsi)
-
- addq $16,%rdi
- addq $16,%rsi
- subq $1,%r10
- jnz L$256_enc_msg_x8_loop2
-
-L$256_enc_msg_x8_out:
- ret
-
-
-
-.globl _aes256gcmsiv_dec
-.private_extern _aes256gcmsiv_dec
-
-.p2align 4
-_aes256gcmsiv_dec:
-
-_CET_ENDBR
- testq $~15,%r9
- jnz L$256_dec_start
- ret
-
-L$256_dec_start:
- vzeroupper
- vmovdqa (%rdx),%xmm0
- movq %rdx,%rax
-
- leaq 32(%rax),%rax
- leaq 32(%rcx),%rcx
-
-
- vmovdqu (%rdi,%r9,1),%xmm15
- vpor OR_MASK(%rip),%xmm15,%xmm15
- andq $~15,%r9
-
-
- cmpq $96,%r9
- jb L$256_dec_loop2
-
-
- subq $96,%r9
- vmovdqa %xmm15,%xmm7
- vpaddd one(%rip),%xmm7,%xmm8
- vpaddd two(%rip),%xmm7,%xmm9
- vpaddd one(%rip),%xmm9,%xmm10
- vpaddd two(%rip),%xmm9,%xmm11
- vpaddd one(%rip),%xmm11,%xmm12
- vpaddd two(%rip),%xmm11,%xmm15
-
- vpxor (%r8),%xmm7,%xmm7
- vpxor (%r8),%xmm8,%xmm8
- vpxor (%r8),%xmm9,%xmm9
- vpxor (%r8),%xmm10,%xmm10
- vpxor (%r8),%xmm11,%xmm11
- vpxor (%r8),%xmm12,%xmm12
-
- vmovdqu 16(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 32(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 48(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 64(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 80(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 96(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 112(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 128(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 144(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 160(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 176(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 192(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 208(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 224(%r8),%xmm4
- vaesenclast %xmm4,%xmm7,%xmm7
- vaesenclast %xmm4,%xmm8,%xmm8
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm4,%xmm10,%xmm10
- vaesenclast %xmm4,%xmm11,%xmm11
- vaesenclast %xmm4,%xmm12,%xmm12
-
-
- vpxor 0(%rdi),%xmm7,%xmm7
- vpxor 16(%rdi),%xmm8,%xmm8
- vpxor 32(%rdi),%xmm9,%xmm9
- vpxor 48(%rdi),%xmm10,%xmm10
- vpxor 64(%rdi),%xmm11,%xmm11
- vpxor 80(%rdi),%xmm12,%xmm12
-
- vmovdqu %xmm7,0(%rsi)
- vmovdqu %xmm8,16(%rsi)
- vmovdqu %xmm9,32(%rsi)
- vmovdqu %xmm10,48(%rsi)
- vmovdqu %xmm11,64(%rsi)
- vmovdqu %xmm12,80(%rsi)
-
- addq $96,%rdi
- addq $96,%rsi
- jmp L$256_dec_loop1
-
-
-.p2align 6
-L$256_dec_loop1:
- cmpq $96,%r9
- jb L$256_dec_finish_96
- subq $96,%r9
-
- vmovdqa %xmm12,%xmm6
- vmovdqa %xmm11,16-32(%rax)
- vmovdqa %xmm10,32-32(%rax)
- vmovdqa %xmm9,48-32(%rax)
- vmovdqa %xmm8,64-32(%rax)
- vmovdqa %xmm7,80-32(%rax)
-
- vmovdqa %xmm15,%xmm7
- vpaddd one(%rip),%xmm7,%xmm8
- vpaddd two(%rip),%xmm7,%xmm9
- vpaddd one(%rip),%xmm9,%xmm10
- vpaddd two(%rip),%xmm9,%xmm11
- vpaddd one(%rip),%xmm11,%xmm12
- vpaddd two(%rip),%xmm11,%xmm15
-
- vmovdqa (%r8),%xmm4
- vpxor %xmm4,%xmm7,%xmm7
- vpxor %xmm4,%xmm8,%xmm8
- vpxor %xmm4,%xmm9,%xmm9
- vpxor %xmm4,%xmm10,%xmm10
- vpxor %xmm4,%xmm11,%xmm11
- vpxor %xmm4,%xmm12,%xmm12
-
- vmovdqu 0-32(%rcx),%xmm4
- vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
- vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
- vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 16(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu -16(%rax),%xmm6
- vmovdqu -16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 32(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 0(%rax),%xmm6
- vmovdqu 0(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 48(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 16(%rax),%xmm6
- vmovdqu 16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 64(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 32(%rax),%xmm6
- vmovdqu 32(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 80(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 96(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 112(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
-
- vmovdqa 80-32(%rax),%xmm6
- vpxor %xmm0,%xmm6,%xmm6
- vmovdqu 80-32(%rcx),%xmm5
-
- vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 128(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
-
- vpsrldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm2,%xmm5
- vpslldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm0
-
- vmovdqa poly(%rip),%xmm3
-
- vmovdqu 144(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 160(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 176(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 192(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 208(%r8),%xmm4
- vaesenc %xmm4,%xmm7,%xmm7
- vaesenc %xmm4,%xmm8,%xmm8
- vaesenc %xmm4,%xmm9,%xmm9
- vaesenc %xmm4,%xmm10,%xmm10
- vaesenc %xmm4,%xmm11,%xmm11
- vaesenc %xmm4,%xmm12,%xmm12
-
- vmovdqu 224(%r8),%xmm6
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpxor 0(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm7,%xmm7
- vpxor 16(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm8,%xmm8
- vpxor 32(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm9,%xmm9
- vpxor 48(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm10,%xmm10
- vpxor 64(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm11,%xmm11
- vpxor 80(%rdi),%xmm6,%xmm4
- vaesenclast %xmm4,%xmm12,%xmm12
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vmovdqu %xmm7,0(%rsi)
- vmovdqu %xmm8,16(%rsi)
- vmovdqu %xmm9,32(%rsi)
- vmovdqu %xmm10,48(%rsi)
- vmovdqu %xmm11,64(%rsi)
- vmovdqu %xmm12,80(%rsi)
-
- vpxor %xmm5,%xmm0,%xmm0
-
- leaq 96(%rdi),%rdi
- leaq 96(%rsi),%rsi
- jmp L$256_dec_loop1
-
-L$256_dec_finish_96:
- vmovdqa %xmm12,%xmm6
- vmovdqa %xmm11,16-32(%rax)
- vmovdqa %xmm10,32-32(%rax)
- vmovdqa %xmm9,48-32(%rax)
- vmovdqa %xmm8,64-32(%rax)
- vmovdqa %xmm7,80-32(%rax)
-
- vmovdqu 0-32(%rcx),%xmm4
- vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
- vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
- vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu -16(%rax),%xmm6
- vmovdqu -16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 0(%rax),%xmm6
- vmovdqu 0(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 16(%rax),%xmm6
- vmovdqu 16(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vmovdqu 32(%rax),%xmm6
- vmovdqu 32(%rcx),%xmm13
-
- vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
-
- vmovdqu 80-32(%rax),%xmm6
- vpxor %xmm0,%xmm6,%xmm6
- vmovdqu 80-32(%rcx),%xmm5
- vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
- vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm2,%xmm5
- vpslldq $8,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm0
-
- vmovdqa poly(%rip),%xmm3
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpalignr $8,%xmm0,%xmm0,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
- vpxor %xmm0,%xmm2,%xmm0
-
- vpxor %xmm5,%xmm0,%xmm0
-
-L$256_dec_loop2:
-
-
-
- cmpq $16,%r9
- jb L$256_dec_out
- subq $16,%r9
-
- vmovdqa %xmm15,%xmm2
- vpaddd one(%rip),%xmm15,%xmm15
-
- vpxor 0(%r8),%xmm2,%xmm2
- vaesenc 16(%r8),%xmm2,%xmm2
- vaesenc 32(%r8),%xmm2,%xmm2
- vaesenc 48(%r8),%xmm2,%xmm2
- vaesenc 64(%r8),%xmm2,%xmm2
- vaesenc 80(%r8),%xmm2,%xmm2
- vaesenc 96(%r8),%xmm2,%xmm2
- vaesenc 112(%r8),%xmm2,%xmm2
- vaesenc 128(%r8),%xmm2,%xmm2
- vaesenc 144(%r8),%xmm2,%xmm2
- vaesenc 160(%r8),%xmm2,%xmm2
- vaesenc 176(%r8),%xmm2,%xmm2
- vaesenc 192(%r8),%xmm2,%xmm2
- vaesenc 208(%r8),%xmm2,%xmm2
- vaesenclast 224(%r8),%xmm2,%xmm2
- vpxor (%rdi),%xmm2,%xmm2
- vmovdqu %xmm2,(%rsi)
- addq $16,%rdi
- addq $16,%rsi
-
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa -32(%rcx),%xmm1
- call GFMUL
-
- jmp L$256_dec_loop2
-
-L$256_dec_out:
- vmovdqu %xmm0,(%rdx)
- ret
-
-
-.globl _aes256gcmsiv_kdf
-.private_extern _aes256gcmsiv_kdf
-
-.p2align 4
-_aes256gcmsiv_kdf:
-
-_CET_ENDBR
-
-
-
-
- vmovdqa (%rdx),%xmm1
- vmovdqa 0(%rdi),%xmm4
- vmovdqa and_mask(%rip),%xmm11
- vmovdqa one(%rip),%xmm8
- vpshufd $0x90,%xmm4,%xmm4
- vpand %xmm11,%xmm4,%xmm4
- vpaddd %xmm8,%xmm4,%xmm6
- vpaddd %xmm8,%xmm6,%xmm7
- vpaddd %xmm8,%xmm7,%xmm11
- vpaddd %xmm8,%xmm11,%xmm12
- vpaddd %xmm8,%xmm12,%xmm13
-
- vpxor %xmm1,%xmm4,%xmm4
- vpxor %xmm1,%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm1,%xmm11,%xmm11
- vpxor %xmm1,%xmm12,%xmm12
- vpxor %xmm1,%xmm13,%xmm13
-
- vmovdqa 16(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 32(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 48(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 64(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 80(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 96(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 112(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 128(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 144(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 160(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 176(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 192(%rdx),%xmm2
- vaesenc %xmm2,%xmm4,%xmm4
- vaesenc %xmm2,%xmm6,%xmm6
- vaesenc %xmm2,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vaesenc %xmm2,%xmm12,%xmm12
- vaesenc %xmm2,%xmm13,%xmm13
-
- vmovdqa 208(%rdx),%xmm1
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
-
- vmovdqa 224(%rdx),%xmm2
- vaesenclast %xmm2,%xmm4,%xmm4
- vaesenclast %xmm2,%xmm6,%xmm6
- vaesenclast %xmm2,%xmm7,%xmm7
- vaesenclast %xmm2,%xmm11,%xmm11
- vaesenclast %xmm2,%xmm12,%xmm12
- vaesenclast %xmm2,%xmm13,%xmm13
-
-
- vmovdqa %xmm4,0(%rsi)
- vmovdqa %xmm6,16(%rsi)
- vmovdqa %xmm7,32(%rsi)
- vmovdqa %xmm11,48(%rsi)
- vmovdqa %xmm12,64(%rsi)
- vmovdqa %xmm13,80(%rsi)
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S b/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S
deleted file mode 100644
index e4a7202..0000000
--- a/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S
+++ /dev/null
@@ -1,8875 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-chacha20_poly1305_constants:
-
-.section __DATA,__const
-.p2align 6
-L$chacha20_consts:
-.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-L$rol8:
-.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-L$rol16:
-.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-L$avx2_init:
-.long 0,0,0,0
-L$sse_inc:
-.long 1,0,0,0
-L$avx2_inc:
-.long 2,0,0,0,2,0,0,0
-L$clamp:
-.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
-.p2align 4
-L$and_masks:
-.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
-.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
-.text
-
-
-.p2align 6
-poly_hash_ad_internal:
-
-
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
- cmpq $13,%r8
- jne L$hash_ad_loop
-L$poly_fast_tls_ad:
-
- movq (%rcx),%r10
- movq 5(%rcx),%r11
- shrq $24,%r11
- movq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- ret
-L$hash_ad_loop:
-
- cmpq $16,%r8
- jb L$hash_ad_tail
- addq 0+0(%rcx),%r10
- adcq 8+0(%rcx),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rcx),%rcx
- subq $16,%r8
- jmp L$hash_ad_loop
-L$hash_ad_tail:
- cmpq $0,%r8
- je L$hash_ad_done
-
- xorq %r13,%r13
- xorq %r14,%r14
- xorq %r15,%r15
- addq %r8,%rcx
-L$hash_ad_tail_loop:
- shldq $8,%r13,%r14
- shlq $8,%r13
- movzbq -1(%rcx),%r15
- xorq %r15,%r13
- decq %rcx
- decq %r8
- jne L$hash_ad_tail_loop
-
- addq %r13,%r10
- adcq %r14,%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
-L$hash_ad_done:
- ret
-
-
-
-.globl _chacha20_poly1305_open
-.private_extern _chacha20_poly1305_open
-
-.p2align 6
-_chacha20_poly1305_open:
-
-_CET_ENDBR
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-
-
- pushq %r9
-
- subq $288 + 0 + 32,%rsp
-
-
- leaq 32(%rsp),%rbp
- andq $-32,%rbp
-
- movq %rdx,%rbx
- movq %r8,0+0+32(%rbp)
- movq %rbx,8+0+32(%rbp)
-
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- andl $288,%eax
- xorl $288,%eax
- jz chacha20_poly1305_open_avx2
-
- cmpq $128,%rbx
- jbe L$open_sse_128
-
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqu 0(%r9),%xmm4
- movdqu 16(%r9),%xmm8
- movdqu 32(%r9),%xmm12
-
- movdqa %xmm12,%xmm7
-
- movdqa %xmm4,0+48(%rbp)
- movdqa %xmm8,0+64(%rbp)
- movdqa %xmm12,0+96(%rbp)
- movq $10,%r10
-L$open_sse_init_rounds:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
-
- decq %r10
- jne L$open_sse_init_rounds
-
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
-
- pand L$clamp(%rip),%xmm0
- movdqa %xmm0,0+0(%rbp)
- movdqa %xmm4,0+16(%rbp)
-
- movq %r8,%r8
- call poly_hash_ad_internal
-L$open_sse_main_loop:
- cmpq $256,%rbx
- jb L$open_sse_tail
-
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa %xmm0,%xmm2
- movdqa %xmm4,%xmm6
- movdqa %xmm8,%xmm10
- movdqa %xmm0,%xmm3
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm11
- movdqa 0+96(%rbp),%xmm15
- paddd L$sse_inc(%rip),%xmm15
- movdqa %xmm15,%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm14,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
- movdqa %xmm15,0+144(%rbp)
-
-
-
- movq $4,%rcx
- movq %rsi,%r8
-L$open_sse_main_loop_rounds:
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
-
- leaq 16(%r8),%r8
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
-
- decq %rcx
- jge L$open_sse_main_loop_rounds
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%r8),%r8
- cmpq $-6,%rcx
- jg L$open_sse_main_loop_rounds
- paddd L$chacha20_consts(%rip),%xmm3
- paddd 0+48(%rbp),%xmm7
- paddd 0+64(%rbp),%xmm11
- paddd 0+144(%rbp),%xmm15
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqa %xmm12,0+80(%rbp)
- movdqu 0 + 0(%rsi),%xmm12
- pxor %xmm3,%xmm12
- movdqu %xmm12,0 + 0(%rdi)
- movdqu 16 + 0(%rsi),%xmm12
- pxor %xmm7,%xmm12
- movdqu %xmm12,16 + 0(%rdi)
- movdqu 32 + 0(%rsi),%xmm12
- pxor %xmm11,%xmm12
- movdqu %xmm12,32 + 0(%rdi)
- movdqu 48 + 0(%rsi),%xmm12
- pxor %xmm15,%xmm12
- movdqu %xmm12,48 + 0(%rdi)
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 64(%rdi)
- movdqu %xmm6,16 + 64(%rdi)
- movdqu %xmm10,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
- movdqu 0 + 128(%rsi),%xmm3
- movdqu 16 + 128(%rsi),%xmm7
- movdqu 32 + 128(%rsi),%xmm11
- movdqu 48 + 128(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 128(%rdi)
- movdqu %xmm5,16 + 128(%rdi)
- movdqu %xmm9,32 + 128(%rdi)
- movdqu %xmm15,48 + 128(%rdi)
- movdqu 0 + 192(%rsi),%xmm3
- movdqu 16 + 192(%rsi),%xmm7
- movdqu 32 + 192(%rsi),%xmm11
- movdqu 48 + 192(%rsi),%xmm15
- pxor %xmm3,%xmm0
- pxor %xmm7,%xmm4
- pxor %xmm11,%xmm8
- pxor 0+80(%rbp),%xmm15
- movdqu %xmm0,0 + 192(%rdi)
- movdqu %xmm4,16 + 192(%rdi)
- movdqu %xmm8,32 + 192(%rdi)
- movdqu %xmm15,48 + 192(%rdi)
-
- leaq 256(%rsi),%rsi
- leaq 256(%rdi),%rdi
- subq $256,%rbx
- jmp L$open_sse_main_loop
-L$open_sse_tail:
-
- testq %rbx,%rbx
- jz L$open_sse_finalize
- cmpq $192,%rbx
- ja L$open_sse_tail_256
- cmpq $128,%rbx
- ja L$open_sse_tail_192
- cmpq $64,%rbx
- ja L$open_sse_tail_128
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa 0+96(%rbp),%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
-
- xorq %r8,%r8
- movq %rbx,%rcx
- cmpq $16,%rcx
- jb L$open_sse_tail_64_rounds
-L$open_sse_tail_64_rounds_and_x1hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- subq $16,%rcx
-L$open_sse_tail_64_rounds:
- addq $16,%r8
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
-
- cmpq $16,%rcx
- jae L$open_sse_tail_64_rounds_and_x1hash
- cmpq $160,%r8
- jne L$open_sse_tail_64_rounds
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
-
- jmp L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_128:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa 0+96(%rbp),%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
-
- movq %rbx,%rcx
- andq $-16,%rcx
- xorq %r8,%r8
-L$open_sse_tail_128_rounds_and_x1hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-L$open_sse_tail_128_rounds:
- addq $16,%r8
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-
- cmpq %rcx,%r8
- jb L$open_sse_tail_128_rounds_and_x1hash
- cmpq $160,%r8
- jne L$open_sse_tail_128_rounds
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqu 0 + 0(%rsi),%xmm3
- movdqu 16 + 0(%rsi),%xmm7
- movdqu 32 + 0(%rsi),%xmm11
- movdqu 48 + 0(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 0(%rdi)
- movdqu %xmm5,16 + 0(%rdi)
- movdqu %xmm9,32 + 0(%rdi)
- movdqu %xmm15,48 + 0(%rdi)
-
- subq $64,%rbx
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- jmp L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_192:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa %xmm0,%xmm2
- movdqa %xmm4,%xmm6
- movdqa %xmm8,%xmm10
- movdqa 0+96(%rbp),%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm14,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
-
- movq %rbx,%rcx
- movq $160,%r8
- cmpq $160,%rcx
- cmovgq %r8,%rcx
- andq $-16,%rcx
- xorq %r8,%r8
-L$open_sse_tail_192_rounds_and_x1hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-L$open_sse_tail_192_rounds:
- addq $16,%r8
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-
- cmpq %rcx,%r8
- jb L$open_sse_tail_192_rounds_and_x1hash
- cmpq $160,%r8
- jne L$open_sse_tail_192_rounds
- cmpq $176,%rbx
- jb L$open_sse_tail_192_finish
- addq 0+160(%rsi),%r10
- adcq 8+160(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- cmpq $192,%rbx
- jb L$open_sse_tail_192_finish
- addq 0+176(%rsi),%r10
- adcq 8+176(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-L$open_sse_tail_192_finish:
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqu 0 + 0(%rsi),%xmm3
- movdqu 16 + 0(%rsi),%xmm7
- movdqu 32 + 0(%rsi),%xmm11
- movdqu 48 + 0(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 0(%rdi)
- movdqu %xmm6,16 + 0(%rdi)
- movdqu %xmm10,32 + 0(%rdi)
- movdqu %xmm15,48 + 0(%rdi)
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 64(%rdi)
- movdqu %xmm5,16 + 64(%rdi)
- movdqu %xmm9,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
-
- subq $128,%rbx
- leaq 128(%rsi),%rsi
- leaq 128(%rdi),%rdi
- jmp L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_256:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa %xmm0,%xmm2
- movdqa %xmm4,%xmm6
- movdqa %xmm8,%xmm10
- movdqa %xmm0,%xmm3
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm11
- movdqa 0+96(%rbp),%xmm15
- paddd L$sse_inc(%rip),%xmm15
- movdqa %xmm15,%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm14,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
- movdqa %xmm15,0+144(%rbp)
-
- xorq %r8,%r8
-L$open_sse_tail_256_rounds_and_x1hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movdqa %xmm11,0+80(%rbp)
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm4
- pxor %xmm11,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm4
- pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm5
- pxor %xmm11,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm5
- pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm6
- pxor %xmm11,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm6
- pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
- movdqa 0+80(%rbp),%xmm11
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movdqa %xmm9,0+80(%rbp)
- paddd %xmm7,%xmm3
- pxor %xmm3,%xmm15
- pshufb L$rol16(%rip),%xmm15
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm9
- pslld $12,%xmm9
- psrld $20,%xmm7
- pxor %xmm9,%xmm7
- paddd %xmm7,%xmm3
- pxor %xmm3,%xmm15
- pshufb L$rol8(%rip),%xmm15
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm9
- pslld $7,%xmm9
- psrld $25,%xmm7
- pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
- movdqa 0+80(%rbp),%xmm9
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- movdqa %xmm11,0+80(%rbp)
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm4
- pxor %xmm11,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm4
- pxor %xmm11,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm5
- pxor %xmm11,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm5
- pxor %xmm11,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm11
- pslld $12,%xmm11
- psrld $20,%xmm6
- pxor %xmm11,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm11
- pslld $7,%xmm11
- psrld $25,%xmm6
- pxor %xmm11,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
- movdqa 0+80(%rbp),%xmm11
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- movdqa %xmm9,0+80(%rbp)
- paddd %xmm7,%xmm3
- pxor %xmm3,%xmm15
- pshufb L$rol16(%rip),%xmm15
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm9
- pslld $12,%xmm9
- psrld $20,%xmm7
- pxor %xmm9,%xmm7
- paddd %xmm7,%xmm3
- pxor %xmm3,%xmm15
- pshufb L$rol8(%rip),%xmm15
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm9
- pslld $7,%xmm9
- psrld $25,%xmm7
- pxor %xmm9,%xmm7
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
- movdqa 0+80(%rbp),%xmm9
-
- addq $16,%r8
- cmpq $160,%r8
- jb L$open_sse_tail_256_rounds_and_x1hash
-
- movq %rbx,%rcx
- andq $-16,%rcx
-L$open_sse_tail_256_hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- addq $16,%r8
- cmpq %rcx,%r8
- jb L$open_sse_tail_256_hash
- paddd L$chacha20_consts(%rip),%xmm3
- paddd 0+48(%rbp),%xmm7
- paddd 0+64(%rbp),%xmm11
- paddd 0+144(%rbp),%xmm15
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqa %xmm12,0+80(%rbp)
- movdqu 0 + 0(%rsi),%xmm12
- pxor %xmm3,%xmm12
- movdqu %xmm12,0 + 0(%rdi)
- movdqu 16 + 0(%rsi),%xmm12
- pxor %xmm7,%xmm12
- movdqu %xmm12,16 + 0(%rdi)
- movdqu 32 + 0(%rsi),%xmm12
- pxor %xmm11,%xmm12
- movdqu %xmm12,32 + 0(%rdi)
- movdqu 48 + 0(%rsi),%xmm12
- pxor %xmm15,%xmm12
- movdqu %xmm12,48 + 0(%rdi)
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 64(%rdi)
- movdqu %xmm6,16 + 64(%rdi)
- movdqu %xmm10,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
- movdqu 0 + 128(%rsi),%xmm3
- movdqu 16 + 128(%rsi),%xmm7
- movdqu 32 + 128(%rsi),%xmm11
- movdqu 48 + 128(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 128(%rdi)
- movdqu %xmm5,16 + 128(%rdi)
- movdqu %xmm9,32 + 128(%rdi)
- movdqu %xmm15,48 + 128(%rdi)
-
- movdqa 0+80(%rbp),%xmm12
- subq $192,%rbx
- leaq 192(%rsi),%rsi
- leaq 192(%rdi),%rdi
-
-
-L$open_sse_tail_64_dec_loop:
- cmpq $16,%rbx
- jb L$open_sse_tail_16_init
- subq $16,%rbx
- movdqu (%rsi),%xmm3
- pxor %xmm3,%xmm0
- movdqu %xmm0,(%rdi)
- leaq 16(%rsi),%rsi
- leaq 16(%rdi),%rdi
- movdqa %xmm4,%xmm0
- movdqa %xmm8,%xmm4
- movdqa %xmm12,%xmm8
- jmp L$open_sse_tail_64_dec_loop
-L$open_sse_tail_16_init:
- movdqa %xmm0,%xmm1
-
-
-L$open_sse_tail_16:
- testq %rbx,%rbx
- jz L$open_sse_finalize
-
-
-
- pxor %xmm3,%xmm3
- leaq -1(%rsi,%rbx,1),%rsi
- movq %rbx,%r8
-L$open_sse_tail_16_compose:
- pslldq $1,%xmm3
- pinsrb $0,(%rsi),%xmm3
- subq $1,%rsi
- subq $1,%r8
- jnz L$open_sse_tail_16_compose
-
-.byte 102,73,15,126,221
- pextrq $1,%xmm3,%r14
-
- pxor %xmm1,%xmm3
-
-
-L$open_sse_tail_16_extract:
- pextrb $0,%xmm3,(%rdi)
- psrldq $1,%xmm3
- addq $1,%rdi
- subq $1,%rbx
- jne L$open_sse_tail_16_extract
-
- addq %r13,%r10
- adcq %r14,%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
-L$open_sse_finalize:
- addq 0+0+32(%rbp),%r10
- adcq 8+0+32(%rbp),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
- movq %r10,%r13
- movq %r11,%r14
- movq %r12,%r15
- subq $-5,%r10
- sbbq $-1,%r11
- sbbq $3,%r12
- cmovcq %r13,%r10
- cmovcq %r14,%r11
- cmovcq %r15,%r12
-
- addq 0+0+16(%rbp),%r10
- adcq 8+0+16(%rbp),%r11
-
-
- addq $288 + 0 + 32,%rsp
-
-
- popq %r9
-
- movq %r10,(%r9)
- movq %r11,8(%r9)
- popq %r15
-
- popq %r14
-
- popq %r13
-
- popq %r12
-
- popq %rbx
-
- popq %rbp
-
- ret
-
-L$open_sse_128:
-
- movdqu L$chacha20_consts(%rip),%xmm0
- movdqa %xmm0,%xmm1
- movdqa %xmm0,%xmm2
- movdqu 0(%r9),%xmm4
- movdqa %xmm4,%xmm5
- movdqa %xmm4,%xmm6
- movdqu 16(%r9),%xmm8
- movdqa %xmm8,%xmm9
- movdqa %xmm8,%xmm10
- movdqu 32(%r9),%xmm12
- movdqa %xmm12,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm11
- movdqa %xmm13,%xmm15
- movq $10,%r10
-
-L$open_sse_128_rounds:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-
- decq %r10
- jnz L$open_sse_128_rounds
- paddd L$chacha20_consts(%rip),%xmm0
- paddd L$chacha20_consts(%rip),%xmm1
- paddd L$chacha20_consts(%rip),%xmm2
- paddd %xmm7,%xmm4
- paddd %xmm7,%xmm5
- paddd %xmm7,%xmm6
- paddd %xmm11,%xmm9
- paddd %xmm11,%xmm10
- paddd %xmm15,%xmm13
- paddd L$sse_inc(%rip),%xmm15
- paddd %xmm15,%xmm14
-
- pand L$clamp(%rip),%xmm0
- movdqa %xmm0,0+0(%rbp)
- movdqa %xmm4,0+16(%rbp)
-
- movq %r8,%r8
- call poly_hash_ad_internal
-L$open_sse_128_xor_hash:
- cmpq $16,%rbx
- jb L$open_sse_tail_16
- subq $16,%rbx
- addq 0+0(%rsi),%r10
- adcq 8+0(%rsi),%r11
- adcq $1,%r12
-
-
- movdqu 0(%rsi),%xmm3
- pxor %xmm3,%xmm1
- movdqu %xmm1,0(%rdi)
- leaq 16(%rsi),%rsi
- leaq 16(%rdi),%rdi
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
- movdqa %xmm5,%xmm1
- movdqa %xmm9,%xmm5
- movdqa %xmm13,%xmm9
- movdqa %xmm2,%xmm13
- movdqa %xmm6,%xmm2
- movdqa %xmm10,%xmm6
- movdqa %xmm14,%xmm10
- jmp L$open_sse_128_xor_hash
-
-
-
-
-
-
-
-
-
-.globl _chacha20_poly1305_seal
-.private_extern _chacha20_poly1305_seal
-
-.p2align 6
-_chacha20_poly1305_seal:
-
-_CET_ENDBR
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-
-
- pushq %r9
-
- subq $288 + 0 + 32,%rsp
-
- leaq 32(%rsp),%rbp
- andq $-32,%rbp
-
- movq 56(%r9),%rbx
- addq %rdx,%rbx
- movq %r8,0+0+32(%rbp)
- movq %rbx,8+0+32(%rbp)
- movq %rdx,%rbx
-
- movl _OPENSSL_ia32cap_P+8(%rip),%eax
- andl $288,%eax
- xorl $288,%eax
- jz chacha20_poly1305_seal_avx2
-
- cmpq $128,%rbx
- jbe L$seal_sse_128
-
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqu 0(%r9),%xmm4
- movdqu 16(%r9),%xmm8
- movdqu 32(%r9),%xmm12
-
- movdqa %xmm0,%xmm1
- movdqa %xmm0,%xmm2
- movdqa %xmm0,%xmm3
- movdqa %xmm4,%xmm5
- movdqa %xmm4,%xmm6
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm9
- movdqa %xmm8,%xmm10
- movdqa %xmm8,%xmm11
- movdqa %xmm12,%xmm15
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,%xmm14
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,%xmm13
- paddd L$sse_inc(%rip),%xmm12
-
- movdqa %xmm4,0+48(%rbp)
- movdqa %xmm8,0+64(%rbp)
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
- movdqa %xmm15,0+144(%rbp)
- movq $10,%r10
-L$seal_sse_init_rounds:
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
-
- decq %r10
- jnz L$seal_sse_init_rounds
- paddd L$chacha20_consts(%rip),%xmm3
- paddd 0+48(%rbp),%xmm7
- paddd 0+64(%rbp),%xmm11
- paddd 0+144(%rbp),%xmm15
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
-
-
- pand L$clamp(%rip),%xmm3
- movdqa %xmm3,0+0(%rbp)
- movdqa %xmm7,0+16(%rbp)
-
- movq %r8,%r8
- call poly_hash_ad_internal
- movdqu 0 + 0(%rsi),%xmm3
- movdqu 16 + 0(%rsi),%xmm7
- movdqu 32 + 0(%rsi),%xmm11
- movdqu 48 + 0(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 0(%rdi)
- movdqu %xmm6,16 + 0(%rdi)
- movdqu %xmm10,32 + 0(%rdi)
- movdqu %xmm15,48 + 0(%rdi)
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 64(%rdi)
- movdqu %xmm5,16 + 64(%rdi)
- movdqu %xmm9,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
-
- cmpq $192,%rbx
- ja L$seal_sse_main_init
- movq $128,%rcx
- subq $128,%rbx
- leaq 128(%rsi),%rsi
- jmp L$seal_sse_128_tail_hash
-L$seal_sse_main_init:
- movdqu 0 + 128(%rsi),%xmm3
- movdqu 16 + 128(%rsi),%xmm7
- movdqu 32 + 128(%rsi),%xmm11
- movdqu 48 + 128(%rsi),%xmm15
- pxor %xmm3,%xmm0
- pxor %xmm7,%xmm4
- pxor %xmm11,%xmm8
- pxor %xmm12,%xmm15
- movdqu %xmm0,0 + 128(%rdi)
- movdqu %xmm4,16 + 128(%rdi)
- movdqu %xmm8,32 + 128(%rdi)
- movdqu %xmm15,48 + 128(%rdi)
-
- movq $192,%rcx
- subq $192,%rbx
- leaq 192(%rsi),%rsi
- movq $2,%rcx
- movq $8,%r8
- cmpq $64,%rbx
- jbe L$seal_sse_tail_64
- cmpq $128,%rbx
- jbe L$seal_sse_tail_128
- cmpq $192,%rbx
- jbe L$seal_sse_tail_192
-
-L$seal_sse_main_loop:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa %xmm0,%xmm2
- movdqa %xmm4,%xmm6
- movdqa %xmm8,%xmm10
- movdqa %xmm0,%xmm3
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm11
- movdqa 0+96(%rbp),%xmm15
- paddd L$sse_inc(%rip),%xmm15
- movdqa %xmm15,%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm14,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
- movdqa %xmm15,0+144(%rbp)
-
-.p2align 5
-L$seal_sse_main_rounds:
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
-.byte 102,15,58,15,255,4
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,12
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- movdqa %xmm8,0+80(%rbp)
- movdqa L$rol16(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $20,%xmm8
- pslld $32-20,%xmm4
- pxor %xmm8,%xmm4
- movdqa L$rol8(%rip),%xmm8
- paddd %xmm7,%xmm3
- paddd %xmm6,%xmm2
- paddd %xmm5,%xmm1
- paddd %xmm4,%xmm0
- pxor %xmm3,%xmm15
- pxor %xmm2,%xmm14
- pxor %xmm1,%xmm13
- pxor %xmm0,%xmm12
-.byte 102,69,15,56,0,248
-.byte 102,69,15,56,0,240
-.byte 102,69,15,56,0,232
-.byte 102,69,15,56,0,224
- movdqa 0+80(%rbp),%xmm8
- paddd %xmm15,%xmm11
- paddd %xmm14,%xmm10
- paddd %xmm13,%xmm9
- paddd %xmm12,%xmm8
- pxor %xmm11,%xmm7
- pxor %xmm10,%xmm6
- pxor %xmm9,%xmm5
- pxor %xmm8,%xmm4
- movdqa %xmm8,0+80(%rbp)
- movdqa %xmm7,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm7
- pxor %xmm8,%xmm7
- movdqa %xmm6,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm6
- pxor %xmm8,%xmm6
- movdqa %xmm5,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm5
- pxor %xmm8,%xmm5
- movdqa %xmm4,%xmm8
- psrld $25,%xmm8
- pslld $32-25,%xmm4
- pxor %xmm8,%xmm4
- movdqa 0+80(%rbp),%xmm8
-.byte 102,15,58,15,255,12
-.byte 102,69,15,58,15,219,8
-.byte 102,69,15,58,15,255,4
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
-
- leaq 16(%rdi),%rdi
- decq %r8
- jge L$seal_sse_main_rounds
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
- decq %rcx
- jg L$seal_sse_main_rounds
- paddd L$chacha20_consts(%rip),%xmm3
- paddd 0+48(%rbp),%xmm7
- paddd 0+64(%rbp),%xmm11
- paddd 0+144(%rbp),%xmm15
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
-
- movdqa %xmm14,0+80(%rbp)
- movdqa %xmm14,0+80(%rbp)
- movdqu 0 + 0(%rsi),%xmm14
- pxor %xmm3,%xmm14
- movdqu %xmm14,0 + 0(%rdi)
- movdqu 16 + 0(%rsi),%xmm14
- pxor %xmm7,%xmm14
- movdqu %xmm14,16 + 0(%rdi)
- movdqu 32 + 0(%rsi),%xmm14
- pxor %xmm11,%xmm14
- movdqu %xmm14,32 + 0(%rdi)
- movdqu 48 + 0(%rsi),%xmm14
- pxor %xmm15,%xmm14
- movdqu %xmm14,48 + 0(%rdi)
-
- movdqa 0+80(%rbp),%xmm14
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 64(%rdi)
- movdqu %xmm6,16 + 64(%rdi)
- movdqu %xmm10,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
- movdqu 0 + 128(%rsi),%xmm3
- movdqu 16 + 128(%rsi),%xmm7
- movdqu 32 + 128(%rsi),%xmm11
- movdqu 48 + 128(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 128(%rdi)
- movdqu %xmm5,16 + 128(%rdi)
- movdqu %xmm9,32 + 128(%rdi)
- movdqu %xmm15,48 + 128(%rdi)
-
- cmpq $256,%rbx
- ja L$seal_sse_main_loop_xor
-
- movq $192,%rcx
- subq $192,%rbx
- leaq 192(%rsi),%rsi
- jmp L$seal_sse_128_tail_hash
-L$seal_sse_main_loop_xor:
- movdqu 0 + 192(%rsi),%xmm3
- movdqu 16 + 192(%rsi),%xmm7
- movdqu 32 + 192(%rsi),%xmm11
- movdqu 48 + 192(%rsi),%xmm15
- pxor %xmm3,%xmm0
- pxor %xmm7,%xmm4
- pxor %xmm11,%xmm8
- pxor %xmm12,%xmm15
- movdqu %xmm0,0 + 192(%rdi)
- movdqu %xmm4,16 + 192(%rdi)
- movdqu %xmm8,32 + 192(%rdi)
- movdqu %xmm15,48 + 192(%rdi)
-
- leaq 256(%rsi),%rsi
- subq $256,%rbx
- movq $6,%rcx
- movq $4,%r8
- cmpq $192,%rbx
- jg L$seal_sse_main_loop
- movq %rbx,%rcx
- testq %rbx,%rbx
- je L$seal_sse_128_tail_hash
- movq $6,%rcx
- cmpq $128,%rbx
- ja L$seal_sse_tail_192
- cmpq $64,%rbx
- ja L$seal_sse_tail_128
-
-L$seal_sse_tail_64:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa 0+96(%rbp),%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
-
-L$seal_sse_tail_64_rounds_and_x2hash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_sse_tail_64_rounds_and_x1hash:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
- decq %rcx
- jg L$seal_sse_tail_64_rounds_and_x2hash
- decq %r8
- jge L$seal_sse_tail_64_rounds_and_x1hash
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
-
- jmp L$seal_sse_128_tail_xor
-
-L$seal_sse_tail_128:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa 0+96(%rbp),%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
-
-L$seal_sse_tail_128_rounds_and_x2hash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_sse_tail_128_rounds_and_x1hash:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
-
- leaq 16(%rdi),%rdi
- decq %rcx
- jg L$seal_sse_tail_128_rounds_and_x2hash
- decq %r8
- jge L$seal_sse_tail_128_rounds_and_x1hash
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqu 0 + 0(%rsi),%xmm3
- movdqu 16 + 0(%rsi),%xmm7
- movdqu 32 + 0(%rsi),%xmm11
- movdqu 48 + 0(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 0(%rdi)
- movdqu %xmm5,16 + 0(%rdi)
- movdqu %xmm9,32 + 0(%rdi)
- movdqu %xmm15,48 + 0(%rdi)
-
- movq $64,%rcx
- subq $64,%rbx
- leaq 64(%rsi),%rsi
- jmp L$seal_sse_128_tail_hash
-
-L$seal_sse_tail_192:
- movdqa L$chacha20_consts(%rip),%xmm0
- movdqa 0+48(%rbp),%xmm4
- movdqa 0+64(%rbp),%xmm8
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm5
- movdqa %xmm8,%xmm9
- movdqa %xmm0,%xmm2
- movdqa %xmm4,%xmm6
- movdqa %xmm8,%xmm10
- movdqa 0+96(%rbp),%xmm14
- paddd L$sse_inc(%rip),%xmm14
- movdqa %xmm14,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm13,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,0+96(%rbp)
- movdqa %xmm13,0+112(%rbp)
- movdqa %xmm14,0+128(%rbp)
-
-L$seal_sse_tail_192_rounds_and_x2hash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_sse_tail_192_rounds_and_x1hash:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-
- leaq 16(%rdi),%rdi
- decq %rcx
- jg L$seal_sse_tail_192_rounds_and_x2hash
- decq %r8
- jge L$seal_sse_tail_192_rounds_and_x1hash
- paddd L$chacha20_consts(%rip),%xmm2
- paddd 0+48(%rbp),%xmm6
- paddd 0+64(%rbp),%xmm10
- paddd 0+128(%rbp),%xmm14
- paddd L$chacha20_consts(%rip),%xmm1
- paddd 0+48(%rbp),%xmm5
- paddd 0+64(%rbp),%xmm9
- paddd 0+112(%rbp),%xmm13
- paddd L$chacha20_consts(%rip),%xmm0
- paddd 0+48(%rbp),%xmm4
- paddd 0+64(%rbp),%xmm8
- paddd 0+96(%rbp),%xmm12
- movdqu 0 + 0(%rsi),%xmm3
- movdqu 16 + 0(%rsi),%xmm7
- movdqu 32 + 0(%rsi),%xmm11
- movdqu 48 + 0(%rsi),%xmm15
- pxor %xmm3,%xmm2
- pxor %xmm7,%xmm6
- pxor %xmm11,%xmm10
- pxor %xmm14,%xmm15
- movdqu %xmm2,0 + 0(%rdi)
- movdqu %xmm6,16 + 0(%rdi)
- movdqu %xmm10,32 + 0(%rdi)
- movdqu %xmm15,48 + 0(%rdi)
- movdqu 0 + 64(%rsi),%xmm3
- movdqu 16 + 64(%rsi),%xmm7
- movdqu 32 + 64(%rsi),%xmm11
- movdqu 48 + 64(%rsi),%xmm15
- pxor %xmm3,%xmm1
- pxor %xmm7,%xmm5
- pxor %xmm11,%xmm9
- pxor %xmm13,%xmm15
- movdqu %xmm1,0 + 64(%rdi)
- movdqu %xmm5,16 + 64(%rdi)
- movdqu %xmm9,32 + 64(%rdi)
- movdqu %xmm15,48 + 64(%rdi)
-
- movq $128,%rcx
- subq $128,%rbx
- leaq 128(%rsi),%rsi
-
-L$seal_sse_128_tail_hash:
- cmpq $16,%rcx
- jb L$seal_sse_128_tail_xor
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- subq $16,%rcx
- leaq 16(%rdi),%rdi
- jmp L$seal_sse_128_tail_hash
-
-L$seal_sse_128_tail_xor:
- cmpq $16,%rbx
- jb L$seal_sse_tail_16
- subq $16,%rbx
-
- movdqu 0(%rsi),%xmm3
- pxor %xmm3,%xmm0
- movdqu %xmm0,0(%rdi)
-
- addq 0(%rdi),%r10
- adcq 8(%rdi),%r11
- adcq $1,%r12
- leaq 16(%rsi),%rsi
- leaq 16(%rdi),%rdi
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
- movdqa %xmm4,%xmm0
- movdqa %xmm8,%xmm4
- movdqa %xmm12,%xmm8
- movdqa %xmm1,%xmm12
- movdqa %xmm5,%xmm1
- movdqa %xmm9,%xmm5
- movdqa %xmm13,%xmm9
- jmp L$seal_sse_128_tail_xor
-
-L$seal_sse_tail_16:
- testq %rbx,%rbx
- jz L$process_blocks_of_extra_in
-
- movq %rbx,%r8
- movq %rbx,%rcx
- leaq -1(%rsi,%rbx,1),%rsi
- pxor %xmm15,%xmm15
-L$seal_sse_tail_16_compose:
- pslldq $1,%xmm15
- pinsrb $0,(%rsi),%xmm15
- leaq -1(%rsi),%rsi
- decq %rcx
- jne L$seal_sse_tail_16_compose
-
-
- pxor %xmm0,%xmm15
-
-
- movq %rbx,%rcx
- movdqu %xmm15,%xmm0
-L$seal_sse_tail_16_extract:
- pextrb $0,%xmm0,(%rdi)
- psrldq $1,%xmm0
- addq $1,%rdi
- subq $1,%rcx
- jnz L$seal_sse_tail_16_extract
-
-
-
-
-
-
-
-
- movq 288 + 0 + 32(%rsp),%r9
- movq 56(%r9),%r14
- movq 48(%r9),%r13
- testq %r14,%r14
- jz L$process_partial_block
-
- movq $16,%r15
- subq %rbx,%r15
- cmpq %r15,%r14
-
- jge L$load_extra_in
- movq %r14,%r15
-
-L$load_extra_in:
-
-
- leaq -1(%r13,%r15,1),%rsi
-
-
- addq %r15,%r13
- subq %r15,%r14
- movq %r13,48(%r9)
- movq %r14,56(%r9)
-
-
-
- addq %r15,%r8
-
-
- pxor %xmm11,%xmm11
-L$load_extra_load_loop:
- pslldq $1,%xmm11
- pinsrb $0,(%rsi),%xmm11
- leaq -1(%rsi),%rsi
- subq $1,%r15
- jnz L$load_extra_load_loop
-
-
-
-
- movq %rbx,%r15
-
-L$load_extra_shift_loop:
- pslldq $1,%xmm11
- subq $1,%r15
- jnz L$load_extra_shift_loop
-
-
-
-
- leaq L$and_masks(%rip),%r15
- shlq $4,%rbx
- pand -16(%r15,%rbx,1),%xmm15
-
-
- por %xmm11,%xmm15
-
-
-
-.byte 102,77,15,126,253
- pextrq $1,%xmm15,%r14
- addq %r13,%r10
- adcq %r14,%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
-L$process_blocks_of_extra_in:
-
- movq 288+32+0 (%rsp),%r9
- movq 48(%r9),%rsi
- movq 56(%r9),%r8
- movq %r8,%rcx
- shrq $4,%r8
-
-L$process_extra_hash_loop:
- jz process_extra_in_trailer
- addq 0+0(%rsi),%r10
- adcq 8+0(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rsi),%rsi
- subq $1,%r8
- jmp L$process_extra_hash_loop
-process_extra_in_trailer:
- andq $15,%rcx
- movq %rcx,%rbx
- jz L$do_length_block
- leaq -1(%rsi,%rcx,1),%rsi
-
-L$process_extra_in_trailer_load:
- pslldq $1,%xmm15
- pinsrb $0,(%rsi),%xmm15
- leaq -1(%rsi),%rsi
- subq $1,%rcx
- jnz L$process_extra_in_trailer_load
-
-L$process_partial_block:
-
- leaq L$and_masks(%rip),%r15
- shlq $4,%rbx
- pand -16(%r15,%rbx,1),%xmm15
-.byte 102,77,15,126,253
- pextrq $1,%xmm15,%r14
- addq %r13,%r10
- adcq %r14,%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
-L$do_length_block:
- addq 0+0+32(%rbp),%r10
- adcq 8+0+32(%rbp),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
- movq %r10,%r13
- movq %r11,%r14
- movq %r12,%r15
- subq $-5,%r10
- sbbq $-1,%r11
- sbbq $3,%r12
- cmovcq %r13,%r10
- cmovcq %r14,%r11
- cmovcq %r15,%r12
-
- addq 0+0+16(%rbp),%r10
- adcq 8+0+16(%rbp),%r11
-
-
- addq $288 + 0 + 32,%rsp
-
-
- popq %r9
-
- movq %r10,(%r9)
- movq %r11,8(%r9)
- popq %r15
-
- popq %r14
-
- popq %r13
-
- popq %r12
-
- popq %rbx
-
- popq %rbp
-
- ret
-
-L$seal_sse_128:
-
- movdqu L$chacha20_consts(%rip),%xmm0
- movdqa %xmm0,%xmm1
- movdqa %xmm0,%xmm2
- movdqu 0(%r9),%xmm4
- movdqa %xmm4,%xmm5
- movdqa %xmm4,%xmm6
- movdqu 16(%r9),%xmm8
- movdqa %xmm8,%xmm9
- movdqa %xmm8,%xmm10
- movdqu 32(%r9),%xmm14
- movdqa %xmm14,%xmm12
- paddd L$sse_inc(%rip),%xmm12
- movdqa %xmm12,%xmm13
- paddd L$sse_inc(%rip),%xmm13
- movdqa %xmm4,%xmm7
- movdqa %xmm8,%xmm11
- movdqa %xmm12,%xmm15
- movq $10,%r10
-
-L$seal_sse_128_rounds:
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,4
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,12
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,4
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,12
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,4
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,12
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol16(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm4
- pxor %xmm3,%xmm4
- paddd %xmm4,%xmm0
- pxor %xmm0,%xmm12
- pshufb L$rol8(%rip),%xmm12
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,15,228,12
-.byte 102,69,15,58,15,192,8
-.byte 102,69,15,58,15,228,4
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol16(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm5
- pxor %xmm3,%xmm5
- paddd %xmm5,%xmm1
- pxor %xmm1,%xmm13
- pshufb L$rol8(%rip),%xmm13
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm5
- pxor %xmm3,%xmm5
-.byte 102,15,58,15,237,12
-.byte 102,69,15,58,15,201,8
-.byte 102,69,15,58,15,237,4
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol16(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $12,%xmm3
- psrld $20,%xmm6
- pxor %xmm3,%xmm6
- paddd %xmm6,%xmm2
- pxor %xmm2,%xmm14
- pshufb L$rol8(%rip),%xmm14
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm3
- pslld $7,%xmm3
- psrld $25,%xmm6
- pxor %xmm3,%xmm6
-.byte 102,15,58,15,246,12
-.byte 102,69,15,58,15,210,8
-.byte 102,69,15,58,15,246,4
-
- decq %r10
- jnz L$seal_sse_128_rounds
- paddd L$chacha20_consts(%rip),%xmm0
- paddd L$chacha20_consts(%rip),%xmm1
- paddd L$chacha20_consts(%rip),%xmm2
- paddd %xmm7,%xmm4
- paddd %xmm7,%xmm5
- paddd %xmm7,%xmm6
- paddd %xmm11,%xmm8
- paddd %xmm11,%xmm9
- paddd %xmm15,%xmm12
- paddd L$sse_inc(%rip),%xmm15
- paddd %xmm15,%xmm13
-
- pand L$clamp(%rip),%xmm2
- movdqa %xmm2,0+0(%rbp)
- movdqa %xmm6,0+16(%rbp)
-
- movq %r8,%r8
- call poly_hash_ad_internal
- jmp L$seal_sse_128_tail_xor
-
-
-
-
-
-.p2align 6
-chacha20_poly1305_open_avx2:
-
-
-
-
-
-
-
-
-
-
-
-
- vzeroupper
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vbroadcasti128 0(%r9),%ymm4
- vbroadcasti128 16(%r9),%ymm8
- vbroadcasti128 32(%r9),%ymm12
- vpaddd L$avx2_init(%rip),%ymm12,%ymm12
- cmpq $192,%rbx
- jbe L$open_avx2_192
- cmpq $320,%rbx
- jbe L$open_avx2_320
-
- vmovdqa %ymm4,0+64(%rbp)
- vmovdqa %ymm8,0+96(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
- movq $10,%r10
-L$open_avx2_init_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
-
- decq %r10
- jne L$open_avx2_init_rounds
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
-
- vpand L$clamp(%rip),%ymm3,%ymm3
- vmovdqa %ymm3,0+0(%rbp)
-
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
-
- movq %r8,%r8
- call poly_hash_ad_internal
-
- xorq %rcx,%rcx
-L$open_avx2_init_hash:
- addq 0+0(%rsi,%rcx,1),%r10
- adcq 8+0(%rsi,%rcx,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- addq $16,%rcx
- cmpq $64,%rcx
- jne L$open_avx2_init_hash
-
- vpxor 0(%rsi),%ymm0,%ymm0
- vpxor 32(%rsi),%ymm4,%ymm4
-
- vmovdqu %ymm0,0(%rdi)
- vmovdqu %ymm4,32(%rdi)
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- subq $64,%rbx
-L$open_avx2_main_loop:
-
- cmpq $512,%rbx
- jb L$open_avx2_main_loop_done
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm15
- vpaddd %ymm15,%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm15,0+256(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
-
- xorq %rcx,%rcx
-L$open_avx2_main_loop_rounds:
- addq 0+0(%rsi,%rcx,1),%r10
- adcq 8+0(%rsi,%rcx,1),%r11
- adcq $1,%r12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- addq %rax,%r15
- adcq %rdx,%r9
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- addq 0+16(%rsi,%rcx,1),%r10
- adcq 8+16(%rsi,%rcx,1),%r11
- adcq $1,%r12
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- addq %rax,%r15
- adcq %rdx,%r9
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- addq 0+32(%rsi,%rcx,1),%r10
- adcq 8+32(%rsi,%rcx,1),%r11
- adcq $1,%r12
-
- leaq 48(%rcx),%rcx
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- addq %rax,%r15
- adcq %rdx,%r9
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpalignr $4,%ymm12,%ymm12,%ymm12
-
- cmpq $60*8,%rcx
- jne L$open_avx2_main_loop_rounds
- vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
- vpaddd 0+64(%rbp),%ymm7,%ymm7
- vpaddd 0+96(%rbp),%ymm11,%ymm11
- vpaddd 0+256(%rbp),%ymm15,%ymm15
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vmovdqa %ymm0,0+128(%rbp)
- addq 0+60*8(%rsi),%r10
- adcq 8+60*8(%rsi),%r11
- adcq $1,%r12
- vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
- vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
- vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
- vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
- vpxor 0+0(%rsi),%ymm0,%ymm0
- vpxor 32+0(%rsi),%ymm3,%ymm3
- vpxor 64+0(%rsi),%ymm7,%ymm7
- vpxor 96+0(%rsi),%ymm11,%ymm11
- vmovdqu %ymm0,0+0(%rdi)
- vmovdqu %ymm3,32+0(%rdi)
- vmovdqu %ymm7,64+0(%rdi)
- vmovdqu %ymm11,96+0(%rdi)
-
- vmovdqa 0+128(%rbp),%ymm0
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm2,%ymm2
- vpxor 64+128(%rsi),%ymm6,%ymm6
- vpxor 96+128(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm2,32+128(%rdi)
- vmovdqu %ymm6,64+128(%rdi)
- vmovdqu %ymm10,96+128(%rdi)
- addq 0+60*8+16(%rsi),%r10
- adcq 8+60*8+16(%rsi),%r11
- adcq $1,%r12
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+256(%rsi),%ymm3,%ymm3
- vpxor 32+256(%rsi),%ymm1,%ymm1
- vpxor 64+256(%rsi),%ymm5,%ymm5
- vpxor 96+256(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+256(%rdi)
- vmovdqu %ymm1,32+256(%rdi)
- vmovdqu %ymm5,64+256(%rdi)
- vmovdqu %ymm9,96+256(%rdi)
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
- vpxor 0+384(%rsi),%ymm3,%ymm3
- vpxor 32+384(%rsi),%ymm0,%ymm0
- vpxor 64+384(%rsi),%ymm4,%ymm4
- vpxor 96+384(%rsi),%ymm8,%ymm8
- vmovdqu %ymm3,0+384(%rdi)
- vmovdqu %ymm0,32+384(%rdi)
- vmovdqu %ymm4,64+384(%rdi)
- vmovdqu %ymm8,96+384(%rdi)
-
- leaq 512(%rsi),%rsi
- leaq 512(%rdi),%rdi
- subq $512,%rbx
- jmp L$open_avx2_main_loop
-L$open_avx2_main_loop_done:
- testq %rbx,%rbx
- vzeroupper
- je L$open_sse_finalize
-
- cmpq $384,%rbx
- ja L$open_avx2_tail_512
- cmpq $256,%rbx
- ja L$open_avx2_tail_384
- cmpq $128,%rbx
- ja L$open_avx2_tail_256
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
-
- xorq %r8,%r8
- movq %rbx,%rcx
- andq $-16,%rcx
- testq %rcx,%rcx
- je L$open_avx2_tail_128_rounds
-L$open_avx2_tail_128_rounds_and_x1hash:
- addq 0+0(%rsi,%r8,1),%r10
- adcq 8+0(%rsi,%r8,1),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-L$open_avx2_tail_128_rounds:
- addq $16,%r8
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
-
- cmpq %rcx,%r8
- jb L$open_avx2_tail_128_rounds_and_x1hash
- cmpq $160,%r8
- jne L$open_avx2_tail_128_rounds
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- jmp L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_256:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
-
- movq %rbx,0+128(%rbp)
- movq %rbx,%rcx
- subq $128,%rcx
- shrq $4,%rcx
- movq $10,%r8
- cmpq $10,%rcx
- cmovgq %r8,%rcx
- movq %rsi,%rbx
- xorq %r8,%r8
-L$open_avx2_tail_256_rounds_and_x1hash:
- addq 0+0(%rbx),%r10
- adcq 8+0(%rbx),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rbx),%rbx
-L$open_avx2_tail_256_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
-
- incq %r8
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm6,%ymm6,%ymm6
-
- cmpq %rcx,%r8
- jb L$open_avx2_tail_256_rounds_and_x1hash
- cmpq $10,%r8
- jne L$open_avx2_tail_256_rounds
- movq %rbx,%r8
- subq %rsi,%rbx
- movq %rbx,%rcx
- movq 0+128(%rbp),%rbx
-L$open_avx2_tail_256_hash:
- addq $16,%rcx
- cmpq %rbx,%rcx
- jg L$open_avx2_tail_256_done
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%r8),%r8
- jmp L$open_avx2_tail_256_hash
-L$open_avx2_tail_256_done:
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+0(%rsi),%ymm3,%ymm3
- vpxor 32+0(%rsi),%ymm1,%ymm1
- vpxor 64+0(%rsi),%ymm5,%ymm5
- vpxor 96+0(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+0(%rdi)
- vmovdqu %ymm1,32+0(%rdi)
- vmovdqu %ymm5,64+0(%rdi)
- vmovdqu %ymm9,96+0(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- leaq 128(%rsi),%rsi
- leaq 128(%rdi),%rdi
- subq $128,%rbx
- jmp L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_384:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
-
- movq %rbx,0+128(%rbp)
- movq %rbx,%rcx
- subq $256,%rcx
- shrq $4,%rcx
- addq $6,%rcx
- movq $10,%r8
- cmpq $10,%rcx
- cmovgq %r8,%rcx
- movq %rsi,%rbx
- xorq %r8,%r8
-L$open_avx2_tail_384_rounds_and_x2hash:
- addq 0+0(%rbx),%r10
- adcq 8+0(%rbx),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rbx),%rbx
-L$open_avx2_tail_384_rounds_and_x1hash:
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- addq 0+0(%rbx),%r10
- adcq 8+0(%rbx),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rbx),%rbx
- incq %r8
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
-
- cmpq %rcx,%r8
- jb L$open_avx2_tail_384_rounds_and_x2hash
- cmpq $10,%r8
- jne L$open_avx2_tail_384_rounds_and_x1hash
- movq %rbx,%r8
- subq %rsi,%rbx
- movq %rbx,%rcx
- movq 0+128(%rbp),%rbx
-L$open_avx2_384_tail_hash:
- addq $16,%rcx
- cmpq %rbx,%rcx
- jg L$open_avx2_384_tail_done
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%r8),%r8
- jmp L$open_avx2_384_tail_hash
-L$open_avx2_384_tail_done:
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+0(%rsi),%ymm3,%ymm3
- vpxor 32+0(%rsi),%ymm2,%ymm2
- vpxor 64+0(%rsi),%ymm6,%ymm6
- vpxor 96+0(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+0(%rdi)
- vmovdqu %ymm2,32+0(%rdi)
- vmovdqu %ymm6,64+0(%rdi)
- vmovdqu %ymm10,96+0(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm1,%ymm1
- vpxor 64+128(%rsi),%ymm5,%ymm5
- vpxor 96+128(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm1,32+128(%rdi)
- vmovdqu %ymm5,64+128(%rdi)
- vmovdqu %ymm9,96+128(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- leaq 256(%rsi),%rsi
- leaq 256(%rdi),%rdi
- subq $256,%rbx
- jmp L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_512:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm15
- vpaddd %ymm15,%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm15,0+256(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
-
- xorq %rcx,%rcx
- movq %rsi,%r8
-L$open_avx2_tail_512_rounds_and_x2hash:
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%r8),%r8
-L$open_avx2_tail_512_rounds_and_x1hash:
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- addq 0+16(%r8),%r10
- adcq 8+16(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%r8),%r8
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm12,%ymm12,%ymm12
-
- incq %rcx
- cmpq $4,%rcx
- jl L$open_avx2_tail_512_rounds_and_x2hash
- cmpq $10,%rcx
- jne L$open_avx2_tail_512_rounds_and_x1hash
- movq %rbx,%rcx
- subq $384,%rcx
- andq $-16,%rcx
-L$open_avx2_tail_512_hash:
- testq %rcx,%rcx
- je L$open_avx2_tail_512_done
- addq 0+0(%r8),%r10
- adcq 8+0(%r8),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%r8),%r8
- subq $16,%rcx
- jmp L$open_avx2_tail_512_hash
-L$open_avx2_tail_512_done:
- vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
- vpaddd 0+64(%rbp),%ymm7,%ymm7
- vpaddd 0+96(%rbp),%ymm11,%ymm11
- vpaddd 0+256(%rbp),%ymm15,%ymm15
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vmovdqa %ymm0,0+128(%rbp)
- vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
- vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
- vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
- vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
- vpxor 0+0(%rsi),%ymm0,%ymm0
- vpxor 32+0(%rsi),%ymm3,%ymm3
- vpxor 64+0(%rsi),%ymm7,%ymm7
- vpxor 96+0(%rsi),%ymm11,%ymm11
- vmovdqu %ymm0,0+0(%rdi)
- vmovdqu %ymm3,32+0(%rdi)
- vmovdqu %ymm7,64+0(%rdi)
- vmovdqu %ymm11,96+0(%rdi)
-
- vmovdqa 0+128(%rbp),%ymm0
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm2,%ymm2
- vpxor 64+128(%rsi),%ymm6,%ymm6
- vpxor 96+128(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm2,32+128(%rdi)
- vmovdqu %ymm6,64+128(%rdi)
- vmovdqu %ymm10,96+128(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+256(%rsi),%ymm3,%ymm3
- vpxor 32+256(%rsi),%ymm1,%ymm1
- vpxor 64+256(%rsi),%ymm5,%ymm5
- vpxor 96+256(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+256(%rdi)
- vmovdqu %ymm1,32+256(%rdi)
- vmovdqu %ymm5,64+256(%rdi)
- vmovdqu %ymm9,96+256(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- leaq 384(%rsi),%rsi
- leaq 384(%rdi),%rdi
- subq $384,%rbx
-L$open_avx2_tail_128_xor:
- cmpq $32,%rbx
- jb L$open_avx2_tail_32_xor
- subq $32,%rbx
- vpxor (%rsi),%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- leaq 32(%rsi),%rsi
- leaq 32(%rdi),%rdi
- vmovdqa %ymm4,%ymm0
- vmovdqa %ymm8,%ymm4
- vmovdqa %ymm12,%ymm8
- jmp L$open_avx2_tail_128_xor
-L$open_avx2_tail_32_xor:
- cmpq $16,%rbx
- vmovdqa %xmm0,%xmm1
- jb L$open_avx2_exit
- subq $16,%rbx
-
- vpxor (%rsi),%xmm0,%xmm1
- vmovdqu %xmm1,(%rdi)
- leaq 16(%rsi),%rsi
- leaq 16(%rdi),%rdi
- vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
- vmovdqa %xmm0,%xmm1
-L$open_avx2_exit:
- vzeroupper
- jmp L$open_sse_tail_16
-
-L$open_avx2_192:
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm8,%ymm10
- vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
- vmovdqa %ymm12,%ymm11
- vmovdqa %ymm13,%ymm15
- movq $10,%r10
-L$open_avx2_192_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
-
- decq %r10
- jne L$open_avx2_192_rounds
- vpaddd %ymm2,%ymm0,%ymm0
- vpaddd %ymm2,%ymm1,%ymm1
- vpaddd %ymm6,%ymm4,%ymm4
- vpaddd %ymm6,%ymm5,%ymm5
- vpaddd %ymm10,%ymm8,%ymm8
- vpaddd %ymm10,%ymm9,%ymm9
- vpaddd %ymm11,%ymm12,%ymm12
- vpaddd %ymm15,%ymm13,%ymm13
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
-
- vpand L$clamp(%rip),%ymm3,%ymm3
- vmovdqa %ymm3,0+0(%rbp)
-
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
-L$open_avx2_short:
- movq %r8,%r8
- call poly_hash_ad_internal
-L$open_avx2_short_hash_and_xor_loop:
- cmpq $32,%rbx
- jb L$open_avx2_short_tail_32
- subq $32,%rbx
- addq 0+0(%rsi),%r10
- adcq 8+0(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- addq 0+16(%rsi),%r10
- adcq 8+16(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-
- vpxor (%rsi),%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- leaq 32(%rsi),%rsi
- leaq 32(%rdi),%rdi
-
- vmovdqa %ymm4,%ymm0
- vmovdqa %ymm8,%ymm4
- vmovdqa %ymm12,%ymm8
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm5,%ymm1
- vmovdqa %ymm9,%ymm5
- vmovdqa %ymm13,%ymm9
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm6,%ymm2
- jmp L$open_avx2_short_hash_and_xor_loop
-L$open_avx2_short_tail_32:
- cmpq $16,%rbx
- vmovdqa %xmm0,%xmm1
- jb L$open_avx2_short_tail_32_exit
- subq $16,%rbx
- addq 0+0(%rsi),%r10
- adcq 8+0(%rsi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- vpxor (%rsi),%xmm0,%xmm3
- vmovdqu %xmm3,(%rdi)
- leaq 16(%rsi),%rsi
- leaq 16(%rdi),%rdi
- vextracti128 $1,%ymm0,%xmm1
-L$open_avx2_short_tail_32_exit:
- vzeroupper
- jmp L$open_sse_tail_16
-
-L$open_avx2_320:
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm8,%ymm10
- vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
- vpaddd L$avx2_inc(%rip),%ymm13,%ymm14
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- movq $10,%r10
-L$open_avx2_320_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm6,%ymm6,%ymm6
-
- decq %r10
- jne L$open_avx2_320_rounds
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd %ymm7,%ymm4,%ymm4
- vpaddd %ymm7,%ymm5,%ymm5
- vpaddd %ymm7,%ymm6,%ymm6
- vpaddd %ymm11,%ymm8,%ymm8
- vpaddd %ymm11,%ymm9,%ymm9
- vpaddd %ymm11,%ymm10,%ymm10
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
-
- vpand L$clamp(%rip),%ymm3,%ymm3
- vmovdqa %ymm3,0+0(%rbp)
-
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
- jmp L$open_avx2_short
-
-
-
-
-
-.p2align 6
-chacha20_poly1305_seal_avx2:
-
-
-
-
-
-
-
-
-
-
-
-
- vzeroupper
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vbroadcasti128 0(%r9),%ymm4
- vbroadcasti128 16(%r9),%ymm8
- vbroadcasti128 32(%r9),%ymm12
- vpaddd L$avx2_init(%rip),%ymm12,%ymm12
- cmpq $192,%rbx
- jbe L$seal_avx2_192
- cmpq $320,%rbx
- jbe L$seal_avx2_320
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm4,0+64(%rbp)
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm8,%ymm11
- vmovdqa %ymm8,0+96(%rbp)
- vmovdqa %ymm12,%ymm15
- vpaddd L$avx2_inc(%rip),%ymm15,%ymm14
- vpaddd L$avx2_inc(%rip),%ymm14,%ymm13
- vpaddd L$avx2_inc(%rip),%ymm13,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm15,0+256(%rbp)
- movq $10,%r10
-L$seal_avx2_init_rounds:
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm12,%ymm12,%ymm12
-
- decq %r10
- jnz L$seal_avx2_init_rounds
- vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
- vpaddd 0+64(%rbp),%ymm7,%ymm7
- vpaddd 0+96(%rbp),%ymm11,%ymm11
- vpaddd 0+256(%rbp),%ymm15,%ymm15
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
- vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
- vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
- vpand L$clamp(%rip),%ymm15,%ymm15
- vmovdqa %ymm15,0+0(%rbp)
- movq %r8,%r8
- call poly_hash_ad_internal
-
- vpxor 0(%rsi),%ymm3,%ymm3
- vpxor 32(%rsi),%ymm11,%ymm11
- vmovdqu %ymm3,0(%rdi)
- vmovdqu %ymm11,32(%rdi)
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+64(%rsi),%ymm15,%ymm15
- vpxor 32+64(%rsi),%ymm2,%ymm2
- vpxor 64+64(%rsi),%ymm6,%ymm6
- vpxor 96+64(%rsi),%ymm10,%ymm10
- vmovdqu %ymm15,0+64(%rdi)
- vmovdqu %ymm2,32+64(%rdi)
- vmovdqu %ymm6,64+64(%rdi)
- vmovdqu %ymm10,96+64(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+192(%rsi),%ymm15,%ymm15
- vpxor 32+192(%rsi),%ymm1,%ymm1
- vpxor 64+192(%rsi),%ymm5,%ymm5
- vpxor 96+192(%rsi),%ymm9,%ymm9
- vmovdqu %ymm15,0+192(%rdi)
- vmovdqu %ymm1,32+192(%rdi)
- vmovdqu %ymm5,64+192(%rdi)
- vmovdqu %ymm9,96+192(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm15,%ymm8
-
- leaq 320(%rsi),%rsi
- subq $320,%rbx
- movq $320,%rcx
- cmpq $128,%rbx
- jbe L$seal_avx2_short_hash_remainder
- vpxor 0(%rsi),%ymm0,%ymm0
- vpxor 32(%rsi),%ymm4,%ymm4
- vpxor 64(%rsi),%ymm8,%ymm8
- vpxor 96(%rsi),%ymm12,%ymm12
- vmovdqu %ymm0,320(%rdi)
- vmovdqu %ymm4,352(%rdi)
- vmovdqu %ymm8,384(%rdi)
- vmovdqu %ymm12,416(%rdi)
- leaq 128(%rsi),%rsi
- subq $128,%rbx
- movq $8,%rcx
- movq $2,%r8
- cmpq $128,%rbx
- jbe L$seal_avx2_tail_128
- cmpq $256,%rbx
- jbe L$seal_avx2_tail_256
- cmpq $384,%rbx
- jbe L$seal_avx2_tail_384
- cmpq $512,%rbx
- jbe L$seal_avx2_tail_512
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm15
- vpaddd %ymm15,%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm15,0+256(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
-
- subq $16,%rdi
- movq $9,%rcx
- jmp L$seal_avx2_main_loop_rounds_entry
-.p2align 5
-L$seal_avx2_main_loop:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm15
- vpaddd %ymm15,%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm15,0+256(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
-
- movq $10,%rcx
-.p2align 5
-L$seal_avx2_main_loop_rounds:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- addq %rax,%r15
- adcq %rdx,%r9
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
-L$seal_avx2_main_loop_rounds_entry:
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- addq %rax,%r15
- adcq %rdx,%r9
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- addq 0+32(%rdi),%r10
- adcq 8+32(%rdi),%r11
- adcq $1,%r12
-
- leaq 48(%rdi),%rdi
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- addq %rax,%r15
- adcq %rdx,%r9
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpalignr $4,%ymm12,%ymm12,%ymm12
-
- decq %rcx
- jne L$seal_avx2_main_loop_rounds
- vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
- vpaddd 0+64(%rbp),%ymm7,%ymm7
- vpaddd 0+96(%rbp),%ymm11,%ymm11
- vpaddd 0+256(%rbp),%ymm15,%ymm15
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vmovdqa %ymm0,0+128(%rbp)
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
- vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
- vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
- vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
- vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
- vpxor 0+0(%rsi),%ymm0,%ymm0
- vpxor 32+0(%rsi),%ymm3,%ymm3
- vpxor 64+0(%rsi),%ymm7,%ymm7
- vpxor 96+0(%rsi),%ymm11,%ymm11
- vmovdqu %ymm0,0+0(%rdi)
- vmovdqu %ymm3,32+0(%rdi)
- vmovdqu %ymm7,64+0(%rdi)
- vmovdqu %ymm11,96+0(%rdi)
-
- vmovdqa 0+128(%rbp),%ymm0
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm2,%ymm2
- vpxor 64+128(%rsi),%ymm6,%ymm6
- vpxor 96+128(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm2,32+128(%rdi)
- vmovdqu %ymm6,64+128(%rdi)
- vmovdqu %ymm10,96+128(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+256(%rsi),%ymm3,%ymm3
- vpxor 32+256(%rsi),%ymm1,%ymm1
- vpxor 64+256(%rsi),%ymm5,%ymm5
- vpxor 96+256(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+256(%rdi)
- vmovdqu %ymm1,32+256(%rdi)
- vmovdqu %ymm5,64+256(%rdi)
- vmovdqu %ymm9,96+256(%rdi)
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
- vpxor 0+384(%rsi),%ymm3,%ymm3
- vpxor 32+384(%rsi),%ymm0,%ymm0
- vpxor 64+384(%rsi),%ymm4,%ymm4
- vpxor 96+384(%rsi),%ymm8,%ymm8
- vmovdqu %ymm3,0+384(%rdi)
- vmovdqu %ymm0,32+384(%rdi)
- vmovdqu %ymm4,64+384(%rdi)
- vmovdqu %ymm8,96+384(%rdi)
-
- leaq 512(%rsi),%rsi
- subq $512,%rbx
- cmpq $512,%rbx
- jg L$seal_avx2_main_loop
-
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
- movq $10,%rcx
- xorq %r8,%r8
-
- cmpq $384,%rbx
- ja L$seal_avx2_tail_512
- cmpq $256,%rbx
- ja L$seal_avx2_tail_384
- cmpq $128,%rbx
- ja L$seal_avx2_tail_256
-
-L$seal_avx2_tail_128:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
-
-L$seal_avx2_tail_128_rounds_and_3xhash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_avx2_tail_128_rounds_and_2xhash:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
- decq %rcx
- jg L$seal_avx2_tail_128_rounds_and_3xhash
- decq %r8
- jge L$seal_avx2_tail_128_rounds_and_2xhash
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- jmp L$seal_avx2_short_loop
-
-L$seal_avx2_tail_256:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
-
-L$seal_avx2_tail_256_rounds_and_3xhash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_avx2_tail_256_rounds_and_2xhash:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
- decq %rcx
- jg L$seal_avx2_tail_256_rounds_and_3xhash
- decq %r8
- jge L$seal_avx2_tail_256_rounds_and_2xhash
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+0(%rsi),%ymm3,%ymm3
- vpxor 32+0(%rsi),%ymm1,%ymm1
- vpxor 64+0(%rsi),%ymm5,%ymm5
- vpxor 96+0(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+0(%rdi)
- vmovdqu %ymm1,32+0(%rdi)
- vmovdqu %ymm5,64+0(%rdi)
- vmovdqu %ymm9,96+0(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- movq $128,%rcx
- leaq 128(%rsi),%rsi
- subq $128,%rbx
- jmp L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_tail_384:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
-
-L$seal_avx2_tail_384_rounds_and_3xhash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_avx2_tail_384_rounds_and_2xhash:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm6,%ymm6,%ymm6
-
- leaq 32(%rdi),%rdi
- decq %rcx
- jg L$seal_avx2_tail_384_rounds_and_3xhash
- decq %r8
- jge L$seal_avx2_tail_384_rounds_and_2xhash
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+0(%rsi),%ymm3,%ymm3
- vpxor 32+0(%rsi),%ymm2,%ymm2
- vpxor 64+0(%rsi),%ymm6,%ymm6
- vpxor 96+0(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+0(%rdi)
- vmovdqu %ymm2,32+0(%rdi)
- vmovdqu %ymm6,64+0(%rdi)
- vmovdqu %ymm10,96+0(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm1,%ymm1
- vpxor 64+128(%rsi),%ymm5,%ymm5
- vpxor 96+128(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm1,32+128(%rdi)
- vmovdqu %ymm5,64+128(%rdi)
- vmovdqu %ymm9,96+128(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- movq $256,%rcx
- leaq 256(%rsi),%rsi
- subq $256,%rbx
- jmp L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_tail_512:
- vmovdqa L$chacha20_consts(%rip),%ymm0
- vmovdqa 0+64(%rbp),%ymm4
- vmovdqa 0+96(%rbp),%ymm8
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm10
- vmovdqa %ymm0,%ymm3
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa L$avx2_inc(%rip),%ymm12
- vpaddd 0+160(%rbp),%ymm12,%ymm15
- vpaddd %ymm15,%ymm12,%ymm14
- vpaddd %ymm14,%ymm12,%ymm13
- vpaddd %ymm13,%ymm12,%ymm12
- vmovdqa %ymm15,0+256(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm12,0+160(%rbp)
-
-L$seal_avx2_tail_512_rounds_and_3xhash:
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- addq %rax,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
-L$seal_avx2_tail_512_rounds_and_2xhash:
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $4,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $12,%ymm15,%ymm15,%ymm15
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $4,%ymm4,%ymm4,%ymm4
- addq %rax,%r15
- adcq %rdx,%r9
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vmovdqa %ymm8,0+128(%rbp)
- vmovdqa L$rol16(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $20,%ymm7,%ymm8
- vpslld $32-20,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $20,%ymm6,%ymm8
- vpslld $32-20,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $20,%ymm5,%ymm8
- vpslld $32-20,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $20,%ymm4,%ymm8
- vpslld $32-20,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa L$rol8(%rip),%ymm8
- vpaddd %ymm7,%ymm3,%ymm3
- vpaddd %ymm6,%ymm2,%ymm2
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm3,%ymm15,%ymm15
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm8,%ymm15,%ymm15
- vpshufb %ymm8,%ymm14,%ymm14
- vpshufb %ymm8,%ymm13,%ymm13
- vpshufb %ymm8,%ymm12,%ymm12
- vpaddd %ymm15,%ymm11,%ymm11
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd 0+128(%rbp),%ymm12,%ymm8
- vpxor %ymm11,%ymm7,%ymm7
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa %ymm8,0+128(%rbp)
- vpsrld $25,%ymm7,%ymm8
- movq 0+0+0(%rbp),%rdx
- movq %rdx,%r15
- mulxq %r10,%r13,%r14
- mulxq %r11,%rax,%rdx
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- vpslld $32-25,%ymm7,%ymm7
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $25,%ymm6,%ymm8
- vpslld $32-25,%ymm6,%ymm6
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $25,%ymm5,%ymm8
- vpslld $32-25,%ymm5,%ymm5
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $25,%ymm4,%ymm8
- vpslld $32-25,%ymm4,%ymm4
- vpxor %ymm8,%ymm4,%ymm4
- vmovdqa 0+128(%rbp),%ymm8
- vpalignr $12,%ymm7,%ymm7,%ymm7
- vpalignr $8,%ymm11,%ymm11,%ymm11
- vpalignr $4,%ymm15,%ymm15,%ymm15
- vpalignr $12,%ymm6,%ymm6,%ymm6
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpalignr $8,%ymm9,%ymm9,%ymm9
- movq 8+0+0(%rbp),%rdx
- mulxq %r10,%r10,%rax
- addq %r10,%r14
- mulxq %r11,%r11,%r9
- adcq %r11,%r15
- adcq $0,%r9
- imulq %r12,%rdx
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm12,%ymm12,%ymm12
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- addq %rax,%r15
- adcq %rdx,%r9
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
- decq %rcx
- jg L$seal_avx2_tail_512_rounds_and_3xhash
- decq %r8
- jge L$seal_avx2_tail_512_rounds_and_2xhash
- vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
- vpaddd 0+64(%rbp),%ymm7,%ymm7
- vpaddd 0+96(%rbp),%ymm11,%ymm11
- vpaddd 0+256(%rbp),%ymm15,%ymm15
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd 0+64(%rbp),%ymm6,%ymm6
- vpaddd 0+96(%rbp),%ymm10,%ymm10
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd 0+64(%rbp),%ymm5,%ymm5
- vpaddd 0+96(%rbp),%ymm9,%ymm9
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd 0+64(%rbp),%ymm4,%ymm4
- vpaddd 0+96(%rbp),%ymm8,%ymm8
- vpaddd 0+160(%rbp),%ymm12,%ymm12
-
- vmovdqa %ymm0,0+128(%rbp)
- vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
- vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
- vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
- vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
- vpxor 0+0(%rsi),%ymm0,%ymm0
- vpxor 32+0(%rsi),%ymm3,%ymm3
- vpxor 64+0(%rsi),%ymm7,%ymm7
- vpxor 96+0(%rsi),%ymm11,%ymm11
- vmovdqu %ymm0,0+0(%rdi)
- vmovdqu %ymm3,32+0(%rdi)
- vmovdqu %ymm7,64+0(%rdi)
- vmovdqu %ymm11,96+0(%rdi)
-
- vmovdqa 0+128(%rbp),%ymm0
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
- vpxor 0+128(%rsi),%ymm3,%ymm3
- vpxor 32+128(%rsi),%ymm2,%ymm2
- vpxor 64+128(%rsi),%ymm6,%ymm6
- vpxor 96+128(%rsi),%ymm10,%ymm10
- vmovdqu %ymm3,0+128(%rdi)
- vmovdqu %ymm2,32+128(%rdi)
- vmovdqu %ymm6,64+128(%rdi)
- vmovdqu %ymm10,96+128(%rdi)
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
- vpxor 0+256(%rsi),%ymm3,%ymm3
- vpxor 32+256(%rsi),%ymm1,%ymm1
- vpxor 64+256(%rsi),%ymm5,%ymm5
- vpxor 96+256(%rsi),%ymm9,%ymm9
- vmovdqu %ymm3,0+256(%rdi)
- vmovdqu %ymm1,32+256(%rdi)
- vmovdqu %ymm5,64+256(%rdi)
- vmovdqu %ymm9,96+256(%rdi)
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
- vmovdqa %ymm3,%ymm8
-
- movq $384,%rcx
- leaq 384(%rsi),%rsi
- subq $384,%rbx
- jmp L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_320:
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm8,%ymm10
- vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
- vpaddd L$avx2_inc(%rip),%ymm13,%ymm14
- vmovdqa %ymm4,%ymm7
- vmovdqa %ymm8,%ymm11
- vmovdqa %ymm12,0+160(%rbp)
- vmovdqa %ymm13,0+192(%rbp)
- vmovdqa %ymm14,0+224(%rbp)
- movq $10,%r10
-L$seal_avx2_320_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $12,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $4,%ymm6,%ymm6,%ymm6
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol16(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpsrld $20,%ymm6,%ymm3
- vpslld $12,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpaddd %ymm6,%ymm2,%ymm2
- vpxor %ymm2,%ymm14,%ymm14
- vpshufb L$rol8(%rip),%ymm14,%ymm14
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm3
- vpsrld $25,%ymm6,%ymm6
- vpxor %ymm3,%ymm6,%ymm6
- vpalignr $4,%ymm14,%ymm14,%ymm14
- vpalignr $8,%ymm10,%ymm10,%ymm10
- vpalignr $12,%ymm6,%ymm6,%ymm6
-
- decq %r10
- jne L$seal_avx2_320_rounds
- vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
- vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
- vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
- vpaddd %ymm7,%ymm4,%ymm4
- vpaddd %ymm7,%ymm5,%ymm5
- vpaddd %ymm7,%ymm6,%ymm6
- vpaddd %ymm11,%ymm8,%ymm8
- vpaddd %ymm11,%ymm9,%ymm9
- vpaddd %ymm11,%ymm10,%ymm10
- vpaddd 0+160(%rbp),%ymm12,%ymm12
- vpaddd 0+192(%rbp),%ymm13,%ymm13
- vpaddd 0+224(%rbp),%ymm14,%ymm14
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
-
- vpand L$clamp(%rip),%ymm3,%ymm3
- vmovdqa %ymm3,0+0(%rbp)
-
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
- vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
- vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
- vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
- vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
- jmp L$seal_avx2_short
-
-L$seal_avx2_192:
- vmovdqa %ymm0,%ymm1
- vmovdqa %ymm0,%ymm2
- vmovdqa %ymm4,%ymm5
- vmovdqa %ymm4,%ymm6
- vmovdqa %ymm8,%ymm9
- vmovdqa %ymm8,%ymm10
- vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
- vmovdqa %ymm12,%ymm11
- vmovdqa %ymm13,%ymm15
- movq $10,%r10
-L$seal_avx2_192_rounds:
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $12,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $4,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $12,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $4,%ymm5,%ymm5,%ymm5
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol16(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $20,%ymm4,%ymm3
- vpslld $12,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpaddd %ymm4,%ymm0,%ymm0
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb L$rol8(%rip),%ymm12,%ymm12
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm3
- vpsrld $25,%ymm4,%ymm4
- vpxor %ymm3,%ymm4,%ymm4
- vpalignr $4,%ymm12,%ymm12,%ymm12
- vpalignr $8,%ymm8,%ymm8,%ymm8
- vpalignr $12,%ymm4,%ymm4,%ymm4
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol16(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpsrld $20,%ymm5,%ymm3
- vpslld $12,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpaddd %ymm5,%ymm1,%ymm1
- vpxor %ymm1,%ymm13,%ymm13
- vpshufb L$rol8(%rip),%ymm13,%ymm13
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm3
- vpsrld $25,%ymm5,%ymm5
- vpxor %ymm3,%ymm5,%ymm5
- vpalignr $4,%ymm13,%ymm13,%ymm13
- vpalignr $8,%ymm9,%ymm9,%ymm9
- vpalignr $12,%ymm5,%ymm5,%ymm5
-
- decq %r10
- jne L$seal_avx2_192_rounds
- vpaddd %ymm2,%ymm0,%ymm0
- vpaddd %ymm2,%ymm1,%ymm1
- vpaddd %ymm6,%ymm4,%ymm4
- vpaddd %ymm6,%ymm5,%ymm5
- vpaddd %ymm10,%ymm8,%ymm8
- vpaddd %ymm10,%ymm9,%ymm9
- vpaddd %ymm11,%ymm12,%ymm12
- vpaddd %ymm15,%ymm13,%ymm13
- vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
-
- vpand L$clamp(%rip),%ymm3,%ymm3
- vmovdqa %ymm3,0+0(%rbp)
-
- vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
- vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
- vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
- vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
- vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
- vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
-L$seal_avx2_short:
- movq %r8,%r8
- call poly_hash_ad_internal
- xorq %rcx,%rcx
-L$seal_avx2_short_hash_remainder:
- cmpq $16,%rcx
- jb L$seal_avx2_short_loop
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- subq $16,%rcx
- addq $16,%rdi
- jmp L$seal_avx2_short_hash_remainder
-L$seal_avx2_short_loop:
- cmpq $32,%rbx
- jb L$seal_avx2_short_tail
- subq $32,%rbx
-
- vpxor (%rsi),%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- leaq 32(%rsi),%rsi
-
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
- addq 0+16(%rdi),%r10
- adcq 8+16(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 32(%rdi),%rdi
-
- vmovdqa %ymm4,%ymm0
- vmovdqa %ymm8,%ymm4
- vmovdqa %ymm12,%ymm8
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm5,%ymm1
- vmovdqa %ymm9,%ymm5
- vmovdqa %ymm13,%ymm9
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm6,%ymm2
- jmp L$seal_avx2_short_loop
-L$seal_avx2_short_tail:
- cmpq $16,%rbx
- jb L$seal_avx2_exit
- subq $16,%rbx
- vpxor (%rsi),%xmm0,%xmm3
- vmovdqu %xmm3,(%rdi)
- leaq 16(%rsi),%rsi
- addq 0+0(%rdi),%r10
- adcq 8+0(%rdi),%r11
- adcq $1,%r12
- movq 0+0+0(%rbp),%rax
- movq %rax,%r15
- mulq %r10
- movq %rax,%r13
- movq %rdx,%r14
- movq 0+0+0(%rbp),%rax
- mulq %r11
- imulq %r12,%r15
- addq %rax,%r14
- adcq %rdx,%r15
- movq 8+0+0(%rbp),%rax
- movq %rax,%r9
- mulq %r10
- addq %rax,%r14
- adcq $0,%rdx
- movq %rdx,%r10
- movq 8+0+0(%rbp),%rax
- mulq %r11
- addq %rax,%r15
- adcq $0,%rdx
- imulq %r12,%r9
- addq %r10,%r15
- adcq %rdx,%r9
- movq %r13,%r10
- movq %r14,%r11
- movq %r15,%r12
- andq $3,%r12
- movq %r15,%r13
- andq $-4,%r13
- movq %r9,%r14
- shrdq $2,%r9,%r15
- shrq $2,%r9
- addq %r13,%r15
- adcq %r14,%r9
- addq %r15,%r10
- adcq %r9,%r11
- adcq $0,%r12
-
- leaq 16(%rdi),%rdi
- vextracti128 $1,%ymm0,%xmm0
-L$seal_avx2_exit:
- vzeroupper
- jmp L$seal_sse_tail_16
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S
deleted file mode 100644
index e1247bc..0000000
--- a/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S
+++ /dev/null
@@ -1,868 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-.p2align 5
-_aesni_ctr32_ghash_6x:
-
- vmovdqu 32(%r11),%xmm2
- subq $6,%rdx
- vpxor %xmm4,%xmm4,%xmm4
- vmovdqu 0-128(%rcx),%xmm15
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpaddb %xmm2,%xmm11,%xmm12
- vpaddb %xmm2,%xmm12,%xmm13
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm15,%xmm1,%xmm9
- vmovdqu %xmm4,16+8(%rsp)
- jmp L$oop6x
-
-.p2align 5
-L$oop6x:
- addl $100663296,%ebx
- jc L$handle_ctr32
- vmovdqu 0-32(%r9),%xmm3
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm15,%xmm10,%xmm10
- vpxor %xmm15,%xmm11,%xmm11
-
-L$resume_ctr32:
- vmovdqu %xmm1,(%r8)
- vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
- vpxor %xmm15,%xmm12,%xmm12
- vmovups 16-128(%rcx),%xmm2
- vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- xorq %r12,%r12
- cmpq %r14,%r15
-
- vaesenc %xmm2,%xmm9,%xmm9
- vmovdqu 48+8(%rsp),%xmm0
- vpxor %xmm15,%xmm13,%xmm13
- vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
- vaesenc %xmm2,%xmm10,%xmm10
- vpxor %xmm15,%xmm14,%xmm14
- setnc %r12b
- vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vmovdqu 16-32(%r9),%xmm3
- negq %r12
- vaesenc %xmm2,%xmm12,%xmm12
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
- vpxor %xmm4,%xmm8,%xmm8
- vaesenc %xmm2,%xmm13,%xmm13
- vpxor %xmm5,%xmm1,%xmm4
- andq $0x60,%r12
- vmovups 32-128(%rcx),%xmm15
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
- vaesenc %xmm2,%xmm14,%xmm14
-
- vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
- leaq (%r14,%r12,1),%r14
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
- vmovdqu 64+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 88(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 80(%r14),%r12
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,32+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,40+8(%rsp)
- vmovdqu 48-32(%r9),%xmm5
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 48-128(%rcx),%xmm15
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm3,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
- vaesenc %xmm15,%xmm11,%xmm11
- vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
- vmovdqu 80+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqu 64-32(%r9),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 64-128(%rcx),%xmm15
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 72(%r14),%r13
- vpxor %xmm5,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 64(%r14),%r12
- vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
- vmovdqu 96+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,48+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,56+8(%rsp)
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 96-32(%r9),%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 80-128(%rcx),%xmm15
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 56(%r14),%r13
- vpxor %xmm1,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
- vpxor 112+8(%rsp),%xmm8,%xmm8
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 48(%r14),%r12
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,64+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,72+8(%rsp)
- vpxor %xmm3,%xmm4,%xmm4
- vmovdqu 112-32(%r9),%xmm3
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 96-128(%rcx),%xmm15
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 40(%r14),%r13
- vpxor %xmm2,%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 32(%r14),%r12
- vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,80+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,88+8(%rsp)
- vpxor %xmm5,%xmm6,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor %xmm1,%xmm6,%xmm6
-
- vmovups 112-128(%rcx),%xmm15
- vpslldq $8,%xmm6,%xmm5
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 16(%r11),%xmm3
-
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm5,%xmm4,%xmm4
- movbeq 24(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 16(%r14),%r12
- vpalignr $8,%xmm4,%xmm4,%xmm0
- vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
- movq %r13,96+8(%rsp)
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r12,104+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- vmovups 128-128(%rcx),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 144-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm10,%xmm10
- vpsrldq $8,%xmm6,%xmm6
- vaesenc %xmm1,%xmm11,%xmm11
- vpxor %xmm6,%xmm7,%xmm7
- vaesenc %xmm1,%xmm12,%xmm12
- vpxor %xmm0,%xmm4,%xmm4
- movbeq 8(%r14),%r13
- vaesenc %xmm1,%xmm13,%xmm13
- movbeq 0(%r14),%r12
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 160-128(%rcx),%xmm1
- cmpl $11,%r10d
- jb L$enc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 176-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 192-128(%rcx),%xmm1
- je L$enc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 208-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 224-128(%rcx),%xmm1
- jmp L$enc_tail
-
-.p2align 5
-L$handle_ctr32:
- vmovdqu (%r11),%xmm0
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vmovdqu 0-32(%r9),%xmm3
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm15,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm15,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpshufb %xmm0,%xmm14,%xmm14
- vpshufb %xmm0,%xmm1,%xmm1
- jmp L$resume_ctr32
-
-.p2align 5
-L$enc_tail:
- vaesenc %xmm15,%xmm9,%xmm9
- vmovdqu %xmm7,16+8(%rsp)
- vpalignr $8,%xmm4,%xmm4,%xmm8
- vaesenc %xmm15,%xmm10,%xmm10
- vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
- vpxor 0(%rdi),%xmm1,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 16(%rdi),%xmm1,%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 32(%rdi),%xmm1,%xmm5
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 48(%rdi),%xmm1,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 64(%rdi),%xmm1,%xmm7
- vpxor 80(%rdi),%xmm1,%xmm3
- vmovdqu (%r8),%xmm1
-
- vaesenclast %xmm2,%xmm9,%xmm9
- vmovdqu 32(%r11),%xmm2
- vaesenclast %xmm0,%xmm10,%xmm10
- vpaddb %xmm2,%xmm1,%xmm0
- movq %r13,112+8(%rsp)
- leaq 96(%rdi),%rdi
-
- prefetcht0 512(%rdi)
- prefetcht0 576(%rdi)
- vaesenclast %xmm5,%xmm11,%xmm11
- vpaddb %xmm2,%xmm0,%xmm5
- movq %r12,120+8(%rsp)
- leaq 96(%rsi),%rsi
- vmovdqu 0-128(%rcx),%xmm15
- vaesenclast %xmm6,%xmm12,%xmm12
- vpaddb %xmm2,%xmm5,%xmm6
- vaesenclast %xmm7,%xmm13,%xmm13
- vpaddb %xmm2,%xmm6,%xmm7
- vaesenclast %xmm3,%xmm14,%xmm14
- vpaddb %xmm2,%xmm7,%xmm3
-
- addq $0x60,%rax
- subq $0x6,%rdx
- jc L$6x_done
-
- vmovups %xmm9,-96(%rsi)
- vpxor %xmm15,%xmm1,%xmm9
- vmovups %xmm10,-80(%rsi)
- vmovdqa %xmm0,%xmm10
- vmovups %xmm11,-64(%rsi)
- vmovdqa %xmm5,%xmm11
- vmovups %xmm12,-48(%rsi)
- vmovdqa %xmm6,%xmm12
- vmovups %xmm13,-32(%rsi)
- vmovdqa %xmm7,%xmm13
- vmovups %xmm14,-16(%rsi)
- vmovdqa %xmm3,%xmm14
- vmovdqu 32+8(%rsp),%xmm7
- jmp L$oop6x
-
-L$6x_done:
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpxor %xmm4,%xmm8,%xmm8
-
- ret
-
-
-.globl _aesni_gcm_decrypt
-.private_extern _aesni_gcm_decrypt
-
-.p2align 5
-_aesni_gcm_decrypt:
-
-
-_CET_ENDBR
- xorq %rax,%rax
-
-
-
- cmpq $0x60,%rdx
- jb L$gcm_dec_abort
-
- pushq %rbp
-
-
- movq %rsp,%rbp
-
- pushq %rbx
-
-
- pushq %r12
-
-
- pushq %r13
-
-
- pushq %r14
-
-
- pushq %r15
-
-
- vzeroupper
-
- movq 16(%rbp),%r12
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq L$bswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $0xf80,%r15
- vmovdqu (%r12),%xmm8
- andq $-128,%rsp
- vmovdqu (%r11),%xmm0
- leaq 128(%rcx),%rcx
- leaq 32(%r9),%r9
- movl 240-128(%rcx),%r10d
- vpshufb %xmm0,%xmm8,%xmm8
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc L$dec_no_key_aliasing
- cmpq $768,%r15
- jnc L$dec_no_key_aliasing
- subq %r15,%rsp
-L$dec_no_key_aliasing:
-
- vmovdqu 80(%rdi),%xmm7
- movq %rdi,%r14
- vmovdqu 64(%rdi),%xmm4
-
-
-
-
-
-
-
- leaq -192(%rdi,%rdx,1),%r15
-
- vmovdqu 48(%rdi),%xmm5
- shrq $4,%rdx
- xorq %rax,%rax
- vmovdqu 32(%rdi),%xmm6
- vpshufb %xmm0,%xmm7,%xmm7
- vmovdqu 16(%rdi),%xmm2
- vpshufb %xmm0,%xmm4,%xmm4
- vmovdqu (%rdi),%xmm3
- vpshufb %xmm0,%xmm5,%xmm5
- vmovdqu %xmm4,48(%rsp)
- vpshufb %xmm0,%xmm6,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm2,%xmm2
- vmovdqu %xmm6,80(%rsp)
- vpshufb %xmm0,%xmm3,%xmm3
- vmovdqu %xmm2,96(%rsp)
- vmovdqu %xmm3,112(%rsp)
-
- call _aesni_ctr32_ghash_6x
-
- movq 16(%rbp),%r12
- vmovups %xmm9,-96(%rsi)
- vmovups %xmm10,-80(%rsi)
- vmovups %xmm11,-64(%rsi)
- vmovups %xmm12,-48(%rsi)
- vmovups %xmm13,-32(%rsi)
- vmovups %xmm14,-16(%rsi)
-
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,(%r12)
-
- vzeroupper
- leaq -40(%rbp),%rsp
-
- popq %r15
-
- popq %r14
-
- popq %r13
-
- popq %r12
-
- popq %rbx
-
- popq %rbp
-
-L$gcm_dec_abort:
- ret
-
-
-
-
-.p2align 5
-_aesni_ctr32_6x:
-
- vmovdqu 0-128(%rcx),%xmm4
- vmovdqu 32(%r11),%xmm2
- leaq -1(%r10),%r13
- vmovups 16-128(%rcx),%xmm15
- leaq 32-128(%rcx),%r12
- vpxor %xmm4,%xmm1,%xmm9
- addl $100663296,%ebx
- jc L$handle_ctr32_2
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddb %xmm2,%xmm11,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddb %xmm2,%xmm12,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp L$oop_ctr32
-
-.p2align 4
-L$oop_ctr32:
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
- vmovups (%r12),%xmm15
- leaq 16(%r12),%r12
- decl %r13d
- jnz L$oop_ctr32
-
- vmovdqu (%r12),%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 0(%rdi),%xmm3,%xmm4
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor 16(%rdi),%xmm3,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 32(%rdi),%xmm3,%xmm6
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 48(%rdi),%xmm3,%xmm8
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 64(%rdi),%xmm3,%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 80(%rdi),%xmm3,%xmm3
- leaq 96(%rdi),%rdi
-
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm5,%xmm10,%xmm10
- vaesenclast %xmm6,%xmm11,%xmm11
- vaesenclast %xmm8,%xmm12,%xmm12
- vaesenclast %xmm2,%xmm13,%xmm13
- vaesenclast %xmm3,%xmm14,%xmm14
- vmovups %xmm9,0(%rsi)
- vmovups %xmm10,16(%rsi)
- vmovups %xmm11,32(%rsi)
- vmovups %xmm12,48(%rsi)
- vmovups %xmm13,64(%rsi)
- vmovups %xmm14,80(%rsi)
- leaq 96(%rsi),%rsi
-
- ret
-.p2align 5
-L$handle_ctr32_2:
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpshufb %xmm0,%xmm14,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpshufb %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp L$oop_ctr32
-
-
-
-.globl _aesni_gcm_encrypt
-.private_extern _aesni_gcm_encrypt
-
-.p2align 5
-_aesni_gcm_encrypt:
-
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
- movb $1,_BORINGSSL_function_hit+2(%rip)
-#endif
- xorq %rax,%rax
-
-
-
-
- cmpq $288,%rdx
- jb L$gcm_enc_abort
-
- pushq %rbp
-
-
- movq %rsp,%rbp
-
- pushq %rbx
-
-
- pushq %r12
-
-
- pushq %r13
-
-
- pushq %r14
-
-
- pushq %r15
-
-
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq L$bswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $0xf80,%r15
- leaq 128(%rcx),%rcx
- vmovdqu (%r11),%xmm0
- andq $-128,%rsp
- movl 240-128(%rcx),%r10d
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc L$enc_no_key_aliasing
- cmpq $768,%r15
- jnc L$enc_no_key_aliasing
- subq %r15,%rsp
-L$enc_no_key_aliasing:
-
- movq %rsi,%r14
-
-
-
-
-
-
-
-
- leaq -192(%rsi,%rdx,1),%r15
-
- shrq $4,%rdx
-
- call _aesni_ctr32_6x
- vpshufb %xmm0,%xmm9,%xmm8
- vpshufb %xmm0,%xmm10,%xmm2
- vmovdqu %xmm8,112(%rsp)
- vpshufb %xmm0,%xmm11,%xmm4
- vmovdqu %xmm2,96(%rsp)
- vpshufb %xmm0,%xmm12,%xmm5
- vmovdqu %xmm4,80(%rsp)
- vpshufb %xmm0,%xmm13,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm14,%xmm7
- vmovdqu %xmm6,48(%rsp)
-
- call _aesni_ctr32_6x
-
- movq 16(%rbp),%r12
- leaq 32(%r9),%r9
- vmovdqu (%r12),%xmm8
- subq $12,%rdx
- movq $192,%rax
- vpshufb %xmm0,%xmm8,%xmm8
-
- call _aesni_ctr32_ghash_6x
- vmovdqu 32(%rsp),%xmm7
- vmovdqu (%r11),%xmm0
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm7,%xmm7,%xmm1
- vmovdqu 32-32(%r9),%xmm15
- vmovups %xmm9,-96(%rsi)
- vpshufb %xmm0,%xmm9,%xmm9
- vpxor %xmm7,%xmm1,%xmm1
- vmovups %xmm10,-80(%rsi)
- vpshufb %xmm0,%xmm10,%xmm10
- vmovups %xmm11,-64(%rsi)
- vpshufb %xmm0,%xmm11,%xmm11
- vmovups %xmm12,-48(%rsi)
- vpshufb %xmm0,%xmm12,%xmm12
- vmovups %xmm13,-32(%rsi)
- vpshufb %xmm0,%xmm13,%xmm13
- vmovups %xmm14,-16(%rsi)
- vpshufb %xmm0,%xmm14,%xmm14
- vmovdqu %xmm9,16(%rsp)
- vmovdqu 48(%rsp),%xmm6
- vmovdqu 16-32(%r9),%xmm0
- vpunpckhqdq %xmm6,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
- vpxor %xmm6,%xmm2,%xmm2
- vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
-
- vmovdqu 64(%rsp),%xmm9
- vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm9,%xmm9,%xmm5
- vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
- vpxor %xmm9,%xmm5,%xmm5
- vpxor %xmm7,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vmovdqu 80(%rsp),%xmm1
- vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm4,%xmm7,%xmm7
- vpunpckhqdq %xmm1,%xmm1,%xmm4
- vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpxor %xmm6,%xmm9,%xmm9
- vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 96(%rsp),%xmm2
- vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm2,%xmm2,%xmm7
- vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpxor %xmm9,%xmm1,%xmm1
- vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm5,%xmm4,%xmm4
-
- vpxor 112(%rsp),%xmm8,%xmm8
- vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
- vmovdqu 112-32(%r9),%xmm0
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm1,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
- vpxor %xmm4,%xmm7,%xmm4
-
- vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm1
- vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
- vpxor %xmm14,%xmm1,%xmm1
- vpxor %xmm5,%xmm6,%xmm5
- vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
- vmovdqu 32-32(%r9),%xmm15
- vpxor %xmm2,%xmm8,%xmm7
- vpxor %xmm4,%xmm9,%xmm6
-
- vmovdqu 16-32(%r9),%xmm0
- vpxor %xmm5,%xmm7,%xmm9
- vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
- vpxor %xmm9,%xmm6,%xmm6
- vpunpckhqdq %xmm13,%xmm13,%xmm2
- vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
- vpxor %xmm13,%xmm2,%xmm2
- vpslldq $8,%xmm6,%xmm9
- vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
- vpxor %xmm9,%xmm5,%xmm8
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm6,%xmm7,%xmm7
-
- vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm12,%xmm12,%xmm9
- vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
- vpxor %xmm12,%xmm9,%xmm9
- vpxor %xmm14,%xmm13,%xmm13
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm11,%xmm11,%xmm1
- vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vxorps 16(%rsp),%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm9,%xmm9
-
- vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm10,%xmm10,%xmm2
- vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
- vpxor %xmm10,%xmm2,%xmm2
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpxor %xmm12,%xmm11,%xmm11
- vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm9,%xmm1,%xmm1
-
- vxorps %xmm7,%xmm14,%xmm14
- vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
- vmovdqu 112-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm11,%xmm10,%xmm10
- vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
- vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
- vpxor %xmm4,%xmm5,%xmm5
- vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
- vpxor %xmm10,%xmm7,%xmm7
- vpxor %xmm2,%xmm6,%xmm6
-
- vpxor %xmm5,%xmm7,%xmm4
- vpxor %xmm4,%xmm6,%xmm6
- vpslldq $8,%xmm6,%xmm1
- vmovdqu 16(%r11),%xmm3
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm1,%xmm5,%xmm8
- vpxor %xmm6,%xmm7,%xmm7
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
- vpxor %xmm2,%xmm8,%xmm8
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
- vpxor %xmm7,%xmm2,%xmm2
- vpxor %xmm2,%xmm8,%xmm8
- movq 16(%rbp),%r12
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,(%r12)
-
- vzeroupper
- leaq -40(%rbp),%rsp
-
- popq %r15
-
- popq %r14
-
- popq %r13
-
- popq %r12
-
- popq %rbx
-
- popq %rbp
-
-L$gcm_enc_abort:
- ret
-
-
-
-.section __DATA,__const
-.p2align 6
-L$bswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$poly:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-L$one_msb:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-L$two_lsb:
-.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-L$one_lsb:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align 6
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S
deleted file mode 100644
index b8ba910..0000000
--- a/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S
+++ /dev/null
@@ -1,2507 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-.globl _aes_hw_encrypt
-.private_extern _aes_hw_encrypt
-
-.p2align 4
-_aes_hw_encrypt:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
- movb $1,_BORINGSSL_function_hit+1(%rip)
-#endif
- movups (%rdi),%xmm2
- movl 240(%rdx),%eax
- movups (%rdx),%xmm0
- movups 16(%rdx),%xmm1
- leaq 32(%rdx),%rdx
- xorps %xmm0,%xmm2
-L$oop_enc1_1:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rdx),%xmm1
- leaq 16(%rdx),%rdx
- jnz L$oop_enc1_1
-.byte 102,15,56,221,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- ret
-
-
-
-.globl _aes_hw_decrypt
-.private_extern _aes_hw_decrypt
-
-.p2align 4
-_aes_hw_decrypt:
-
-_CET_ENDBR
- movups (%rdi),%xmm2
- movl 240(%rdx),%eax
- movups (%rdx),%xmm0
- movups 16(%rdx),%xmm1
- leaq 32(%rdx),%rdx
- xorps %xmm0,%xmm2
-L$oop_dec1_2:
-.byte 102,15,56,222,209
- decl %eax
- movups (%rdx),%xmm1
- leaq 16(%rdx),%rdx
- jnz L$oop_dec1_2
-.byte 102,15,56,223,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- ret
-
-
-
-.p2align 4
-_aesni_encrypt2:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
- addq $16,%rax
-
-L$enc_loop2:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$enc_loop2
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
- ret
-
-
-
-.p2align 4
-_aesni_decrypt2:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
- addq $16,%rax
-
-L$dec_loop2:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$dec_loop2
-
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
- ret
-
-
-
-.p2align 4
-_aesni_encrypt3:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- xorps %xmm0,%xmm4
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
- addq $16,%rax
-
-L$enc_loop3:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$enc_loop3
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
- ret
-
-
-
-.p2align 4
-_aesni_decrypt3:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- xorps %xmm0,%xmm4
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
- addq $16,%rax
-
-L$dec_loop3:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$dec_loop3
-
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
- ret
-
-
-
-.p2align 4
-_aesni_encrypt4:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- xorps %xmm0,%xmm4
- xorps %xmm0,%xmm5
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 0x0f,0x1f,0x00
- addq $16,%rax
-
-L$enc_loop4:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$enc_loop4
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
- ret
-
-
-
-.p2align 4
-_aesni_decrypt4:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- xorps %xmm0,%xmm4
- xorps %xmm0,%xmm5
- movups 32(%rcx),%xmm0
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 0x0f,0x1f,0x00
- addq $16,%rax
-
-L$dec_loop4:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$dec_loop4
-
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
- ret
-
-
-
-.p2align 4
-_aesni_encrypt6:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
-.byte 102,15,56,220,209
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
-.byte 102,15,56,220,225
- pxor %xmm0,%xmm7
- movups (%rcx,%rax,1),%xmm0
- addq $16,%rax
- jmp L$enc_loop6_enter
-.p2align 4
-L$enc_loop6:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-L$enc_loop6_enter:
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$enc_loop6
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
- ret
-
-
-
-.p2align 4
-_aesni_decrypt6:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- pxor %xmm0,%xmm3
- pxor %xmm0,%xmm4
-.byte 102,15,56,222,209
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
-.byte 102,15,56,222,225
- pxor %xmm0,%xmm7
- movups (%rcx,%rax,1),%xmm0
- addq $16,%rax
- jmp L$dec_loop6_enter
-.p2align 4
-L$dec_loop6:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-L$dec_loop6_enter:
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$dec_loop6
-
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
- ret
-
-
-
-.p2align 4
-_aesni_encrypt8:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- pxor %xmm0,%xmm4
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 102,15,56,220,209
- pxor %xmm0,%xmm7
- pxor %xmm0,%xmm8
-.byte 102,15,56,220,217
- pxor %xmm0,%xmm9
- movups (%rcx,%rax,1),%xmm0
- addq $16,%rax
- jmp L$enc_loop8_inner
-.p2align 4
-L$enc_loop8:
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-L$enc_loop8_inner:
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
-L$enc_loop8_enter:
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$enc_loop8
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
-.byte 102,15,56,221,208
-.byte 102,15,56,221,216
-.byte 102,15,56,221,224
-.byte 102,15,56,221,232
-.byte 102,15,56,221,240
-.byte 102,15,56,221,248
-.byte 102,68,15,56,221,192
-.byte 102,68,15,56,221,200
- ret
-
-
-
-.p2align 4
-_aesni_decrypt8:
-
- movups (%rcx),%xmm0
- shll $4,%eax
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm2
- xorps %xmm0,%xmm3
- pxor %xmm0,%xmm4
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
- leaq 32(%rcx,%rax,1),%rcx
- negq %rax
-.byte 102,15,56,222,209
- pxor %xmm0,%xmm7
- pxor %xmm0,%xmm8
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm9
- movups (%rcx,%rax,1),%xmm0
- addq $16,%rax
- jmp L$dec_loop8_inner
-.p2align 4
-L$dec_loop8:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-L$dec_loop8_inner:
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
-L$dec_loop8_enter:
- movups (%rcx,%rax,1),%xmm1
- addq $32,%rax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups -16(%rcx,%rax,1),%xmm0
- jnz L$dec_loop8
-
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
-.byte 102,15,56,223,208
-.byte 102,15,56,223,216
-.byte 102,15,56,223,224
-.byte 102,15,56,223,232
-.byte 102,15,56,223,240
-.byte 102,15,56,223,248
-.byte 102,68,15,56,223,192
-.byte 102,68,15,56,223,200
- ret
-
-
-.globl _aes_hw_ecb_encrypt
-.private_extern _aes_hw_ecb_encrypt
-
-.p2align 4
-_aes_hw_ecb_encrypt:
-
-_CET_ENDBR
- andq $-16,%rdx
- jz L$ecb_ret
-
- movl 240(%rcx),%eax
- movups (%rcx),%xmm0
- movq %rcx,%r11
- movl %eax,%r10d
- testl %r8d,%r8d
- jz L$ecb_decrypt
-
- cmpq $0x80,%rdx
- jb L$ecb_enc_tail
-
- movdqu (%rdi),%xmm2
- movdqu 16(%rdi),%xmm3
- movdqu 32(%rdi),%xmm4
- movdqu 48(%rdi),%xmm5
- movdqu 64(%rdi),%xmm6
- movdqu 80(%rdi),%xmm7
- movdqu 96(%rdi),%xmm8
- movdqu 112(%rdi),%xmm9
- leaq 128(%rdi),%rdi
- subq $0x80,%rdx
- jmp L$ecb_enc_loop8_enter
-.p2align 4
-L$ecb_enc_loop8:
- movups %xmm2,(%rsi)
- movq %r11,%rcx
- movdqu (%rdi),%xmm2
- movl %r10d,%eax
- movups %xmm3,16(%rsi)
- movdqu 16(%rdi),%xmm3
- movups %xmm4,32(%rsi)
- movdqu 32(%rdi),%xmm4
- movups %xmm5,48(%rsi)
- movdqu 48(%rdi),%xmm5
- movups %xmm6,64(%rsi)
- movdqu 64(%rdi),%xmm6
- movups %xmm7,80(%rsi)
- movdqu 80(%rdi),%xmm7
- movups %xmm8,96(%rsi)
- movdqu 96(%rdi),%xmm8
- movups %xmm9,112(%rsi)
- leaq 128(%rsi),%rsi
- movdqu 112(%rdi),%xmm9
- leaq 128(%rdi),%rdi
-L$ecb_enc_loop8_enter:
-
- call _aesni_encrypt8
-
- subq $0x80,%rdx
- jnc L$ecb_enc_loop8
-
- movups %xmm2,(%rsi)
- movq %r11,%rcx
- movups %xmm3,16(%rsi)
- movl %r10d,%eax
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- movups %xmm8,96(%rsi)
- movups %xmm9,112(%rsi)
- leaq 128(%rsi),%rsi
- addq $0x80,%rdx
- jz L$ecb_ret
-
-L$ecb_enc_tail:
- movups (%rdi),%xmm2
- cmpq $0x20,%rdx
- jb L$ecb_enc_one
- movups 16(%rdi),%xmm3
- je L$ecb_enc_two
- movups 32(%rdi),%xmm4
- cmpq $0x40,%rdx
- jb L$ecb_enc_three
- movups 48(%rdi),%xmm5
- je L$ecb_enc_four
- movups 64(%rdi),%xmm6
- cmpq $0x60,%rdx
- jb L$ecb_enc_five
- movups 80(%rdi),%xmm7
- je L$ecb_enc_six
- movdqu 96(%rdi),%xmm8
- xorps %xmm9,%xmm9
- call _aesni_encrypt8
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- movups %xmm8,96(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_one:
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_3:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_3
-.byte 102,15,56,221,209
- movups %xmm2,(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_two:
- call _aesni_encrypt2
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_three:
- call _aesni_encrypt3
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_four:
- call _aesni_encrypt4
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_five:
- xorps %xmm7,%xmm7
- call _aesni_encrypt6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- jmp L$ecb_ret
-.p2align 4
-L$ecb_enc_six:
- call _aesni_encrypt6
- movups %xmm2,(%rsi)
- movups %xmm3,16(%rsi)
- movups %xmm4,32(%rsi)
- movups %xmm5,48(%rsi)
- movups %xmm6,64(%rsi)
- movups %xmm7,80(%rsi)
- jmp L$ecb_ret
-
-.p2align 4
-L$ecb_decrypt:
- cmpq $0x80,%rdx
- jb L$ecb_dec_tail
-
- movdqu (%rdi),%xmm2
- movdqu 16(%rdi),%xmm3
- movdqu 32(%rdi),%xmm4
- movdqu 48(%rdi),%xmm5
- movdqu 64(%rdi),%xmm6
- movdqu 80(%rdi),%xmm7
- movdqu 96(%rdi),%xmm8
- movdqu 112(%rdi),%xmm9
- leaq 128(%rdi),%rdi
- subq $0x80,%rdx
- jmp L$ecb_dec_loop8_enter
-.p2align 4
-L$ecb_dec_loop8:
- movups %xmm2,(%rsi)
- movq %r11,%rcx
- movdqu (%rdi),%xmm2
- movl %r10d,%eax
- movups %xmm3,16(%rsi)
- movdqu 16(%rdi),%xmm3
- movups %xmm4,32(%rsi)
- movdqu 32(%rdi),%xmm4
- movups %xmm5,48(%rsi)
- movdqu 48(%rdi),%xmm5
- movups %xmm6,64(%rsi)
- movdqu 64(%rdi),%xmm6
- movups %xmm7,80(%rsi)
- movdqu 80(%rdi),%xmm7
- movups %xmm8,96(%rsi)
- movdqu 96(%rdi),%xmm8
- movups %xmm9,112(%rsi)
- leaq 128(%rsi),%rsi
- movdqu 112(%rdi),%xmm9
- leaq 128(%rdi),%rdi
-L$ecb_dec_loop8_enter:
-
- call _aesni_decrypt8
-
- movups (%r11),%xmm0
- subq $0x80,%rdx
- jnc L$ecb_dec_loop8
-
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movq %r11,%rcx
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movl %r10d,%eax
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- movups %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- movups %xmm7,80(%rsi)
- pxor %xmm7,%xmm7
- movups %xmm8,96(%rsi)
- pxor %xmm8,%xmm8
- movups %xmm9,112(%rsi)
- pxor %xmm9,%xmm9
- leaq 128(%rsi),%rsi
- addq $0x80,%rdx
- jz L$ecb_ret
-
-L$ecb_dec_tail:
- movups (%rdi),%xmm2
- cmpq $0x20,%rdx
- jb L$ecb_dec_one
- movups 16(%rdi),%xmm3
- je L$ecb_dec_two
- movups 32(%rdi),%xmm4
- cmpq $0x40,%rdx
- jb L$ecb_dec_three
- movups 48(%rdi),%xmm5
- je L$ecb_dec_four
- movups 64(%rdi),%xmm6
- cmpq $0x60,%rdx
- jb L$ecb_dec_five
- movups 80(%rdi),%xmm7
- je L$ecb_dec_six
- movups 96(%rdi),%xmm8
- movups (%rcx),%xmm0
- xorps %xmm9,%xmm9
- call _aesni_decrypt8
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- movups %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- movups %xmm7,80(%rsi)
- pxor %xmm7,%xmm7
- movups %xmm8,96(%rsi)
- pxor %xmm8,%xmm8
- pxor %xmm9,%xmm9
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_one:
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_dec1_4:
-.byte 102,15,56,222,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_dec1_4
-.byte 102,15,56,223,209
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_two:
- call _aesni_decrypt2
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_three:
- call _aesni_decrypt3
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_four:
- call _aesni_decrypt4
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_five:
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- movups %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- jmp L$ecb_ret
-.p2align 4
-L$ecb_dec_six:
- call _aesni_decrypt6
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- movups %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movups %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movups %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- movups %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- movups %xmm7,80(%rsi)
- pxor %xmm7,%xmm7
-
-L$ecb_ret:
- xorps %xmm0,%xmm0
- pxor %xmm1,%xmm1
- ret
-
-
-.globl _aes_hw_ctr32_encrypt_blocks
-.private_extern _aes_hw_ctr32_encrypt_blocks
-
-.p2align 4
-_aes_hw_ctr32_encrypt_blocks:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
- movb $1,_BORINGSSL_function_hit(%rip)
-#endif
- cmpq $1,%rdx
- jne L$ctr32_bulk
-
-
-
- movups (%r8),%xmm2
- movups (%rdi),%xmm3
- movl 240(%rcx),%edx
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_enc1_5:
-.byte 102,15,56,220,209
- decl %edx
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_5
-.byte 102,15,56,221,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- xorps %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movups %xmm2,(%rsi)
- xorps %xmm2,%xmm2
- jmp L$ctr32_epilogue
-
-.p2align 4
-L$ctr32_bulk:
- leaq (%rsp),%r11
-
- pushq %rbp
-
- subq $128,%rsp
- andq $-16,%rsp
-
-
-
-
- movdqu (%r8),%xmm2
- movdqu (%rcx),%xmm0
- movl 12(%r8),%r8d
- pxor %xmm0,%xmm2
- movl 12(%rcx),%ebp
- movdqa %xmm2,0(%rsp)
- bswapl %r8d
- movdqa %xmm2,%xmm3
- movdqa %xmm2,%xmm4
- movdqa %xmm2,%xmm5
- movdqa %xmm2,64(%rsp)
- movdqa %xmm2,80(%rsp)
- movdqa %xmm2,96(%rsp)
- movq %rdx,%r10
- movdqa %xmm2,112(%rsp)
-
- leaq 1(%r8),%rax
- leaq 2(%r8),%rdx
- bswapl %eax
- bswapl %edx
- xorl %ebp,%eax
- xorl %ebp,%edx
-.byte 102,15,58,34,216,3
- leaq 3(%r8),%rax
- movdqa %xmm3,16(%rsp)
-.byte 102,15,58,34,226,3
- bswapl %eax
- movq %r10,%rdx
- leaq 4(%r8),%r10
- movdqa %xmm4,32(%rsp)
- xorl %ebp,%eax
- bswapl %r10d
-.byte 102,15,58,34,232,3
- xorl %ebp,%r10d
- movdqa %xmm5,48(%rsp)
- leaq 5(%r8),%r9
- movl %r10d,64+12(%rsp)
- bswapl %r9d
- leaq 6(%r8),%r10
- movl 240(%rcx),%eax
- xorl %ebp,%r9d
- bswapl %r10d
- movl %r9d,80+12(%rsp)
- xorl %ebp,%r10d
- leaq 7(%r8),%r9
- movl %r10d,96+12(%rsp)
- bswapl %r9d
- leaq _OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- xorl %ebp,%r9d
- andl $71303168,%r10d
- movl %r9d,112+12(%rsp)
-
- movups 16(%rcx),%xmm1
-
- movdqa 64(%rsp),%xmm6
- movdqa 80(%rsp),%xmm7
-
- cmpq $8,%rdx
- jb L$ctr32_tail
-
- subq $6,%rdx
- cmpl $4194304,%r10d
- je L$ctr32_6x
-
- leaq 128(%rcx),%rcx
- subq $2,%rdx
- jmp L$ctr32_loop8
-
-.p2align 4
-L$ctr32_6x:
- shll $4,%eax
- movl $48,%r10d
- bswapl %ebp
- leaq 32(%rcx,%rax,1),%rcx
- subq %rax,%r10
- jmp L$ctr32_loop6
-
-.p2align 4
-L$ctr32_loop6:
- addl $6,%r8d
- movups -48(%rcx,%r10,1),%xmm0
-.byte 102,15,56,220,209
- movl %r8d,%eax
- xorl %ebp,%eax
-.byte 102,15,56,220,217
-.byte 0x0f,0x38,0xf1,0x44,0x24,12
- leal 1(%r8),%eax
-.byte 102,15,56,220,225
- xorl %ebp,%eax
-.byte 0x0f,0x38,0xf1,0x44,0x24,28
-.byte 102,15,56,220,233
- leal 2(%r8),%eax
- xorl %ebp,%eax
-.byte 102,15,56,220,241
-.byte 0x0f,0x38,0xf1,0x44,0x24,44
- leal 3(%r8),%eax
-.byte 102,15,56,220,249
- movups -32(%rcx,%r10,1),%xmm1
- xorl %ebp,%eax
-
-.byte 102,15,56,220,208
-.byte 0x0f,0x38,0xf1,0x44,0x24,60
- leal 4(%r8),%eax
-.byte 102,15,56,220,216
- xorl %ebp,%eax
-.byte 0x0f,0x38,0xf1,0x44,0x24,76
-.byte 102,15,56,220,224
- leal 5(%r8),%eax
- xorl %ebp,%eax
-.byte 102,15,56,220,232
-.byte 0x0f,0x38,0xf1,0x44,0x24,92
- movq %r10,%rax
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
- movups -16(%rcx,%r10,1),%xmm0
-
- call L$enc_loop6
-
- movdqu (%rdi),%xmm8
- movdqu 16(%rdi),%xmm9
- movdqu 32(%rdi),%xmm10
- movdqu 48(%rdi),%xmm11
- movdqu 64(%rdi),%xmm12
- movdqu 80(%rdi),%xmm13
- leaq 96(%rdi),%rdi
- movups -64(%rcx,%r10,1),%xmm1
- pxor %xmm2,%xmm8
- movaps 0(%rsp),%xmm2
- pxor %xmm3,%xmm9
- movaps 16(%rsp),%xmm3
- pxor %xmm4,%xmm10
- movaps 32(%rsp),%xmm4
- pxor %xmm5,%xmm11
- movaps 48(%rsp),%xmm5
- pxor %xmm6,%xmm12
- movaps 64(%rsp),%xmm6
- pxor %xmm7,%xmm13
- movaps 80(%rsp),%xmm7
- movdqu %xmm8,(%rsi)
- movdqu %xmm9,16(%rsi)
- movdqu %xmm10,32(%rsi)
- movdqu %xmm11,48(%rsi)
- movdqu %xmm12,64(%rsi)
- movdqu %xmm13,80(%rsi)
- leaq 96(%rsi),%rsi
-
- subq $6,%rdx
- jnc L$ctr32_loop6
-
- addq $6,%rdx
- jz L$ctr32_done
-
- leal -48(%r10),%eax
- leaq -80(%rcx,%r10,1),%rcx
- negl %eax
- shrl $4,%eax
- jmp L$ctr32_tail
-
-.p2align 5
-L$ctr32_loop8:
- addl $8,%r8d
- movdqa 96(%rsp),%xmm8
-.byte 102,15,56,220,209
- movl %r8d,%r9d
- movdqa 112(%rsp),%xmm9
-.byte 102,15,56,220,217
- bswapl %r9d
- movups 32-128(%rcx),%xmm0
-.byte 102,15,56,220,225
- xorl %ebp,%r9d
- nop
-.byte 102,15,56,220,233
- movl %r9d,0+12(%rsp)
- leaq 1(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 48-128(%rcx),%xmm1
- bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
- movl %r9d,16+12(%rsp)
- leaq 2(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 64-128(%rcx),%xmm0
- bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movl %r9d,32+12(%rsp)
- leaq 3(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 80-128(%rcx),%xmm1
- bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
- movl %r9d,48+12(%rsp)
- leaq 4(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 96-128(%rcx),%xmm0
- bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movl %r9d,64+12(%rsp)
- leaq 5(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 112-128(%rcx),%xmm1
- bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
- movl %r9d,80+12(%rsp)
- leaq 6(%r8),%r9
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 128-128(%rcx),%xmm0
- bswapl %r9d
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- xorl %ebp,%r9d
-.byte 0x66,0x90
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movl %r9d,96+12(%rsp)
- leaq 7(%r8),%r9
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 144-128(%rcx),%xmm1
- bswapl %r9d
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
- xorl %ebp,%r9d
- movdqu 0(%rdi),%xmm10
-.byte 102,15,56,220,232
- movl %r9d,112+12(%rsp)
- cmpl $11,%eax
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 160-128(%rcx),%xmm0
-
- jb L$ctr32_enc_done
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 176-128(%rcx),%xmm1
-
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 192-128(%rcx),%xmm0
- je L$ctr32_enc_done
-
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movups 208-128(%rcx),%xmm1
-
-.byte 102,15,56,220,208
-.byte 102,15,56,220,216
-.byte 102,15,56,220,224
-.byte 102,15,56,220,232
-.byte 102,15,56,220,240
-.byte 102,15,56,220,248
-.byte 102,68,15,56,220,192
-.byte 102,68,15,56,220,200
- movups 224-128(%rcx),%xmm0
- jmp L$ctr32_enc_done
-
-.p2align 4
-L$ctr32_enc_done:
- movdqu 16(%rdi),%xmm11
- pxor %xmm0,%xmm10
- movdqu 32(%rdi),%xmm12
- pxor %xmm0,%xmm11
- movdqu 48(%rdi),%xmm13
- pxor %xmm0,%xmm12
- movdqu 64(%rdi),%xmm14
- pxor %xmm0,%xmm13
- movdqu 80(%rdi),%xmm15
- pxor %xmm0,%xmm14
- prefetcht0 448(%rdi)
- prefetcht0 512(%rdi)
- pxor %xmm0,%xmm15
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-.byte 102,68,15,56,220,201
- movdqu 96(%rdi),%xmm1
- leaq 128(%rdi),%rdi
-
-.byte 102,65,15,56,221,210
- pxor %xmm0,%xmm1
- movdqu 112-128(%rdi),%xmm10
-.byte 102,65,15,56,221,219
- pxor %xmm0,%xmm10
- movdqa 0(%rsp),%xmm11
-.byte 102,65,15,56,221,228
-.byte 102,65,15,56,221,237
- movdqa 16(%rsp),%xmm12
- movdqa 32(%rsp),%xmm13
-.byte 102,65,15,56,221,246
-.byte 102,65,15,56,221,255
- movdqa 48(%rsp),%xmm14
- movdqa 64(%rsp),%xmm15
-.byte 102,68,15,56,221,193
- movdqa 80(%rsp),%xmm0
- movups 16-128(%rcx),%xmm1
-.byte 102,69,15,56,221,202
-
- movups %xmm2,(%rsi)
- movdqa %xmm11,%xmm2
- movups %xmm3,16(%rsi)
- movdqa %xmm12,%xmm3
- movups %xmm4,32(%rsi)
- movdqa %xmm13,%xmm4
- movups %xmm5,48(%rsi)
- movdqa %xmm14,%xmm5
- movups %xmm6,64(%rsi)
- movdqa %xmm15,%xmm6
- movups %xmm7,80(%rsi)
- movdqa %xmm0,%xmm7
- movups %xmm8,96(%rsi)
- movups %xmm9,112(%rsi)
- leaq 128(%rsi),%rsi
-
- subq $8,%rdx
- jnc L$ctr32_loop8
-
- addq $8,%rdx
- jz L$ctr32_done
- leaq -128(%rcx),%rcx
-
-L$ctr32_tail:
-
-
- leaq 16(%rcx),%rcx
- cmpq $4,%rdx
- jb L$ctr32_loop3
- je L$ctr32_loop4
-
-
- shll $4,%eax
- movdqa 96(%rsp),%xmm8
- pxor %xmm9,%xmm9
-
- movups 16(%rcx),%xmm0
-.byte 102,15,56,220,209
-.byte 102,15,56,220,217
- leaq 32-16(%rcx,%rax,1),%rcx
- negq %rax
-.byte 102,15,56,220,225
- addq $16,%rax
- movups (%rdi),%xmm10
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
- movups 16(%rdi),%xmm11
- movups 32(%rdi),%xmm12
-.byte 102,15,56,220,249
-.byte 102,68,15,56,220,193
-
- call L$enc_loop8_enter
-
- movdqu 48(%rdi),%xmm13
- pxor %xmm10,%xmm2
- movdqu 64(%rdi),%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm10,%xmm6
- movdqu %xmm5,48(%rsi)
- movdqu %xmm6,64(%rsi)
- cmpq $6,%rdx
- jb L$ctr32_done
-
- movups 80(%rdi),%xmm11
- xorps %xmm11,%xmm7
- movups %xmm7,80(%rsi)
- je L$ctr32_done
-
- movups 96(%rdi),%xmm12
- xorps %xmm12,%xmm8
- movups %xmm8,96(%rsi)
- jmp L$ctr32_done
-
-.p2align 5
-L$ctr32_loop4:
-.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
- decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
-.byte 102,15,56,220,233
- movups (%rcx),%xmm1
- jnz L$ctr32_loop4
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
- movups (%rdi),%xmm10
- movups 16(%rdi),%xmm11
-.byte 102,15,56,221,225
-.byte 102,15,56,221,233
- movups 32(%rdi),%xmm12
- movups 48(%rdi),%xmm13
-
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- xorps %xmm11,%xmm3
- movups %xmm3,16(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm4,32(%rsi)
- pxor %xmm13,%xmm5
- movdqu %xmm5,48(%rsi)
- jmp L$ctr32_done
-
-.p2align 5
-L$ctr32_loop3:
-.byte 102,15,56,220,209
- leaq 16(%rcx),%rcx
- decl %eax
-.byte 102,15,56,220,217
-.byte 102,15,56,220,225
- movups (%rcx),%xmm1
- jnz L$ctr32_loop3
-.byte 102,15,56,221,209
-.byte 102,15,56,221,217
-.byte 102,15,56,221,225
-
- movups (%rdi),%xmm10
- xorps %xmm10,%xmm2
- movups %xmm2,(%rsi)
- cmpq $2,%rdx
- jb L$ctr32_done
-
- movups 16(%rdi),%xmm11
- xorps %xmm11,%xmm3
- movups %xmm3,16(%rsi)
- je L$ctr32_done
-
- movups 32(%rdi),%xmm12
- xorps %xmm12,%xmm4
- movups %xmm4,32(%rsi)
-
-L$ctr32_done:
- xorps %xmm0,%xmm0
- xorl %ebp,%ebp
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- movaps %xmm0,0(%rsp)
- pxor %xmm8,%xmm8
- movaps %xmm0,16(%rsp)
- pxor %xmm9,%xmm9
- movaps %xmm0,32(%rsp)
- pxor %xmm10,%xmm10
- movaps %xmm0,48(%rsp)
- pxor %xmm11,%xmm11
- movaps %xmm0,64(%rsp)
- pxor %xmm12,%xmm12
- movaps %xmm0,80(%rsp)
- pxor %xmm13,%xmm13
- movaps %xmm0,96(%rsp)
- pxor %xmm14,%xmm14
- movaps %xmm0,112(%rsp)
- pxor %xmm15,%xmm15
- movq -8(%r11),%rbp
-
- leaq (%r11),%rsp
-
-L$ctr32_epilogue:
- ret
-
-
-.globl _aes_hw_cbc_encrypt
-.private_extern _aes_hw_cbc_encrypt
-
-.p2align 4
-_aes_hw_cbc_encrypt:
-
-_CET_ENDBR
- testq %rdx,%rdx
- jz L$cbc_ret
-
- movl 240(%rcx),%r10d
- movq %rcx,%r11
- testl %r9d,%r9d
- jz L$cbc_decrypt
-
- movups (%r8),%xmm2
- movl %r10d,%eax
- cmpq $16,%rdx
- jb L$cbc_enc_tail
- subq $16,%rdx
- jmp L$cbc_enc_loop
-.p2align 4
-L$cbc_enc_loop:
- movups (%rdi),%xmm3
- leaq 16(%rdi),%rdi
-
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- xorps %xmm0,%xmm3
- leaq 32(%rcx),%rcx
- xorps %xmm3,%xmm2
-L$oop_enc1_6:
-.byte 102,15,56,220,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_enc1_6
-.byte 102,15,56,221,209
- movl %r10d,%eax
- movq %r11,%rcx
- movups %xmm2,0(%rsi)
- leaq 16(%rsi),%rsi
- subq $16,%rdx
- jnc L$cbc_enc_loop
- addq $16,%rdx
- jnz L$cbc_enc_tail
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movups %xmm2,(%r8)
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- jmp L$cbc_ret
-
-L$cbc_enc_tail:
- movq %rdx,%rcx
- xchgq %rdi,%rsi
-.long 0x9066A4F3
- movl $16,%ecx
- subq %rdx,%rcx
- xorl %eax,%eax
-.long 0x9066AAF3
- leaq -16(%rdi),%rdi
- movl %r10d,%eax
- movq %rdi,%rsi
- movq %r11,%rcx
- xorq %rdx,%rdx
- jmp L$cbc_enc_loop
-
-.p2align 4
-L$cbc_decrypt:
- cmpq $16,%rdx
- jne L$cbc_decrypt_bulk
-
-
-
- movdqu (%rdi),%xmm2
- movdqu (%r8),%xmm3
- movdqa %xmm2,%xmm4
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_dec1_7:
-.byte 102,15,56,222,209
- decl %r10d
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_dec1_7
-.byte 102,15,56,223,209
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movdqu %xmm4,(%r8)
- xorps %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- jmp L$cbc_ret
-.p2align 4
-L$cbc_decrypt_bulk:
- leaq (%rsp),%r11
-
- pushq %rbp
-
- subq $16,%rsp
- andq $-16,%rsp
- movq %rcx,%rbp
- movups (%r8),%xmm10
- movl %r10d,%eax
- cmpq $0x50,%rdx
- jbe L$cbc_dec_tail
-
- movups (%rcx),%xmm0
- movdqu 0(%rdi),%xmm2
- movdqu 16(%rdi),%xmm3
- movdqa %xmm2,%xmm11
- movdqu 32(%rdi),%xmm4
- movdqa %xmm3,%xmm12
- movdqu 48(%rdi),%xmm5
- movdqa %xmm4,%xmm13
- movdqu 64(%rdi),%xmm6
- movdqa %xmm5,%xmm14
- movdqu 80(%rdi),%xmm7
- movdqa %xmm6,%xmm15
- leaq _OPENSSL_ia32cap_P(%rip),%r9
- movl 4(%r9),%r9d
- cmpq $0x70,%rdx
- jbe L$cbc_dec_six_or_seven
-
- andl $71303168,%r9d
- subq $0x50,%rdx
- cmpl $4194304,%r9d
- je L$cbc_dec_loop6_enter
- subq $0x20,%rdx
- leaq 112(%rcx),%rcx
- jmp L$cbc_dec_loop8_enter
-.p2align 4
-L$cbc_dec_loop8:
- movups %xmm9,(%rsi)
- leaq 16(%rsi),%rsi
-L$cbc_dec_loop8_enter:
- movdqu 96(%rdi),%xmm8
- pxor %xmm0,%xmm2
- movdqu 112(%rdi),%xmm9
- pxor %xmm0,%xmm3
- movups 16-112(%rcx),%xmm1
- pxor %xmm0,%xmm4
- movq $-1,%rbp
- cmpq $0x70,%rdx
- pxor %xmm0,%xmm5
- pxor %xmm0,%xmm6
- pxor %xmm0,%xmm7
- pxor %xmm0,%xmm8
-
-.byte 102,15,56,222,209
- pxor %xmm0,%xmm9
- movups 32-112(%rcx),%xmm0
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
- adcq $0,%rbp
- andq $128,%rbp
-.byte 102,68,15,56,222,201
- addq %rdi,%rbp
- movups 48-112(%rcx),%xmm1
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 64-112(%rcx),%xmm0
- nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 80-112(%rcx),%xmm1
- nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 96-112(%rcx),%xmm0
- nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 112-112(%rcx),%xmm1
- nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 128-112(%rcx),%xmm0
- nop
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 144-112(%rcx),%xmm1
- cmpl $11,%eax
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 160-112(%rcx),%xmm0
- jb L$cbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 176-112(%rcx),%xmm1
- nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 192-112(%rcx),%xmm0
- je L$cbc_dec_done
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movups 208-112(%rcx),%xmm1
- nop
-.byte 102,15,56,222,208
-.byte 102,15,56,222,216
-.byte 102,15,56,222,224
-.byte 102,15,56,222,232
-.byte 102,15,56,222,240
-.byte 102,15,56,222,248
-.byte 102,68,15,56,222,192
-.byte 102,68,15,56,222,200
- movups 224-112(%rcx),%xmm0
- jmp L$cbc_dec_done
-.p2align 4
-L$cbc_dec_done:
-.byte 102,15,56,222,209
-.byte 102,15,56,222,217
- pxor %xmm0,%xmm10
- pxor %xmm0,%xmm11
-.byte 102,15,56,222,225
-.byte 102,15,56,222,233
- pxor %xmm0,%xmm12
- pxor %xmm0,%xmm13
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- pxor %xmm0,%xmm14
- pxor %xmm0,%xmm15
-.byte 102,68,15,56,222,193
-.byte 102,68,15,56,222,201
- movdqu 80(%rdi),%xmm1
-
-.byte 102,65,15,56,223,210
- movdqu 96(%rdi),%xmm10
- pxor %xmm0,%xmm1
-.byte 102,65,15,56,223,219
- pxor %xmm0,%xmm10
- movdqu 112(%rdi),%xmm0
-.byte 102,65,15,56,223,228
- leaq 128(%rdi),%rdi
- movdqu 0(%rbp),%xmm11
-.byte 102,65,15,56,223,237
-.byte 102,65,15,56,223,246
- movdqu 16(%rbp),%xmm12
- movdqu 32(%rbp),%xmm13
-.byte 102,65,15,56,223,255
-.byte 102,68,15,56,223,193
- movdqu 48(%rbp),%xmm14
- movdqu 64(%rbp),%xmm15
-.byte 102,69,15,56,223,202
- movdqa %xmm0,%xmm10
- movdqu 80(%rbp),%xmm1
- movups -112(%rcx),%xmm0
-
- movups %xmm2,(%rsi)
- movdqa %xmm11,%xmm2
- movups %xmm3,16(%rsi)
- movdqa %xmm12,%xmm3
- movups %xmm4,32(%rsi)
- movdqa %xmm13,%xmm4
- movups %xmm5,48(%rsi)
- movdqa %xmm14,%xmm5
- movups %xmm6,64(%rsi)
- movdqa %xmm15,%xmm6
- movups %xmm7,80(%rsi)
- movdqa %xmm1,%xmm7
- movups %xmm8,96(%rsi)
- leaq 112(%rsi),%rsi
-
- subq $0x80,%rdx
- ja L$cbc_dec_loop8
-
- movaps %xmm9,%xmm2
- leaq -112(%rcx),%rcx
- addq $0x70,%rdx
- jle L$cbc_dec_clear_tail_collected
- movups %xmm9,(%rsi)
- leaq 16(%rsi),%rsi
- cmpq $0x50,%rdx
- jbe L$cbc_dec_tail
-
- movaps %xmm11,%xmm2
-L$cbc_dec_six_or_seven:
- cmpq $0x60,%rdx
- ja L$cbc_dec_seven
-
- movaps %xmm7,%xmm8
- call _aesni_decrypt6
- pxor %xmm10,%xmm2
- movaps %xmm8,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- pxor %xmm14,%xmm6
- movdqu %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- pxor %xmm15,%xmm7
- movdqu %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- leaq 80(%rsi),%rsi
- movdqa %xmm7,%xmm2
- pxor %xmm7,%xmm7
- jmp L$cbc_dec_tail_collected
-
-.p2align 4
-L$cbc_dec_seven:
- movups 96(%rdi),%xmm8
- xorps %xmm9,%xmm9
- call _aesni_decrypt8
- movups 80(%rdi),%xmm9
- pxor %xmm10,%xmm2
- movups 96(%rdi),%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- pxor %xmm14,%xmm6
- movdqu %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- pxor %xmm15,%xmm7
- movdqu %xmm6,64(%rsi)
- pxor %xmm6,%xmm6
- pxor %xmm9,%xmm8
- movdqu %xmm7,80(%rsi)
- pxor %xmm7,%xmm7
- leaq 96(%rsi),%rsi
- movdqa %xmm8,%xmm2
- pxor %xmm8,%xmm8
- pxor %xmm9,%xmm9
- jmp L$cbc_dec_tail_collected
-
-.p2align 4
-L$cbc_dec_loop6:
- movups %xmm7,(%rsi)
- leaq 16(%rsi),%rsi
- movdqu 0(%rdi),%xmm2
- movdqu 16(%rdi),%xmm3
- movdqa %xmm2,%xmm11
- movdqu 32(%rdi),%xmm4
- movdqa %xmm3,%xmm12
- movdqu 48(%rdi),%xmm5
- movdqa %xmm4,%xmm13
- movdqu 64(%rdi),%xmm6
- movdqa %xmm5,%xmm14
- movdqu 80(%rdi),%xmm7
- movdqa %xmm6,%xmm15
-L$cbc_dec_loop6_enter:
- leaq 96(%rdi),%rdi
- movdqa %xmm7,%xmm8
-
- call _aesni_decrypt6
-
- pxor %xmm10,%xmm2
- movdqa %xmm8,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm14,%xmm6
- movq %rbp,%rcx
- movdqu %xmm5,48(%rsi)
- pxor %xmm15,%xmm7
- movl %r10d,%eax
- movdqu %xmm6,64(%rsi)
- leaq 80(%rsi),%rsi
- subq $0x60,%rdx
- ja L$cbc_dec_loop6
-
- movdqa %xmm7,%xmm2
- addq $0x50,%rdx
- jle L$cbc_dec_clear_tail_collected
- movups %xmm7,(%rsi)
- leaq 16(%rsi),%rsi
-
-L$cbc_dec_tail:
- movups (%rdi),%xmm2
- subq $0x10,%rdx
- jbe L$cbc_dec_one
-
- movups 16(%rdi),%xmm3
- movaps %xmm2,%xmm11
- subq $0x10,%rdx
- jbe L$cbc_dec_two
-
- movups 32(%rdi),%xmm4
- movaps %xmm3,%xmm12
- subq $0x10,%rdx
- jbe L$cbc_dec_three
-
- movups 48(%rdi),%xmm5
- movaps %xmm4,%xmm13
- subq $0x10,%rdx
- jbe L$cbc_dec_four
-
- movups 64(%rdi),%xmm6
- movaps %xmm5,%xmm14
- movaps %xmm6,%xmm15
- xorps %xmm7,%xmm7
- call _aesni_decrypt6
- pxor %xmm10,%xmm2
- movaps %xmm15,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- pxor %xmm14,%xmm6
- movdqu %xmm5,48(%rsi)
- pxor %xmm5,%xmm5
- leaq 64(%rsi),%rsi
- movdqa %xmm6,%xmm2
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- subq $0x10,%rdx
- jmp L$cbc_dec_tail_collected
-
-.p2align 4
-L$cbc_dec_one:
- movaps %xmm2,%xmm11
- movups (%rcx),%xmm0
- movups 16(%rcx),%xmm1
- leaq 32(%rcx),%rcx
- xorps %xmm0,%xmm2
-L$oop_dec1_8:
-.byte 102,15,56,222,209
- decl %eax
- movups (%rcx),%xmm1
- leaq 16(%rcx),%rcx
- jnz L$oop_dec1_8
-.byte 102,15,56,223,209
- xorps %xmm10,%xmm2
- movaps %xmm11,%xmm10
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_two:
- movaps %xmm3,%xmm12
- call _aesni_decrypt2
- pxor %xmm10,%xmm2
- movaps %xmm12,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- movdqa %xmm3,%xmm2
- pxor %xmm3,%xmm3
- leaq 16(%rsi),%rsi
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_three:
- movaps %xmm4,%xmm13
- call _aesni_decrypt3
- pxor %xmm10,%xmm2
- movaps %xmm13,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- movdqa %xmm4,%xmm2
- pxor %xmm4,%xmm4
- leaq 32(%rsi),%rsi
- jmp L$cbc_dec_tail_collected
-.p2align 4
-L$cbc_dec_four:
- movaps %xmm5,%xmm14
- call _aesni_decrypt4
- pxor %xmm10,%xmm2
- movaps %xmm14,%xmm10
- pxor %xmm11,%xmm3
- movdqu %xmm2,(%rsi)
- pxor %xmm12,%xmm4
- movdqu %xmm3,16(%rsi)
- pxor %xmm3,%xmm3
- pxor %xmm13,%xmm5
- movdqu %xmm4,32(%rsi)
- pxor %xmm4,%xmm4
- movdqa %xmm5,%xmm2
- pxor %xmm5,%xmm5
- leaq 48(%rsi),%rsi
- jmp L$cbc_dec_tail_collected
-
-.p2align 4
-L$cbc_dec_clear_tail_collected:
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- pxor %xmm8,%xmm8
- pxor %xmm9,%xmm9
-L$cbc_dec_tail_collected:
- movups %xmm10,(%r8)
- andq $15,%rdx
- jnz L$cbc_dec_tail_partial
- movups %xmm2,(%rsi)
- pxor %xmm2,%xmm2
- jmp L$cbc_dec_ret
-.p2align 4
-L$cbc_dec_tail_partial:
- movaps %xmm2,(%rsp)
- pxor %xmm2,%xmm2
- movq $16,%rcx
- movq %rsi,%rdi
- subq %rdx,%rcx
- leaq (%rsp),%rsi
-.long 0x9066A4F3
- movdqa %xmm2,(%rsp)
-
-L$cbc_dec_ret:
- xorps %xmm0,%xmm0
- pxor %xmm1,%xmm1
- movq -8(%r11),%rbp
-
- leaq (%r11),%rsp
-
-L$cbc_ret:
- ret
-
-
-.globl _aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
-
-.p2align 4
-_aes_hw_set_decrypt_key:
-
-_CET_ENDBR
-.byte 0x48,0x83,0xEC,0x08
-
- call __aesni_set_encrypt_key
- shll $4,%esi
- testl %eax,%eax
- jnz L$dec_key_ret
- leaq 16(%rdx,%rsi,1),%rdi
-
- movups (%rdx),%xmm0
- movups (%rdi),%xmm1
- movups %xmm0,(%rdi)
- movups %xmm1,(%rdx)
- leaq 16(%rdx),%rdx
- leaq -16(%rdi),%rdi
-
-L$dec_key_inverse:
- movups (%rdx),%xmm0
- movups (%rdi),%xmm1
-.byte 102,15,56,219,192
-.byte 102,15,56,219,201
- leaq 16(%rdx),%rdx
- leaq -16(%rdi),%rdi
- movups %xmm0,16(%rdi)
- movups %xmm1,-16(%rdx)
- cmpq %rdx,%rdi
- ja L$dec_key_inverse
-
- movups (%rdx),%xmm0
-.byte 102,15,56,219,192
- pxor %xmm1,%xmm1
- movups %xmm0,(%rdi)
- pxor %xmm0,%xmm0
-L$dec_key_ret:
- addq $8,%rsp
-
- ret
-
-L$SEH_end_set_decrypt_key:
-
-.globl _aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-
-.p2align 4
-_aes_hw_set_encrypt_key:
-__aesni_set_encrypt_key:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
- movb $1,_BORINGSSL_function_hit+3(%rip)
-#endif
-.byte 0x48,0x83,0xEC,0x08
-
- movq $-1,%rax
- testq %rdi,%rdi
- jz L$enc_key_ret
- testq %rdx,%rdx
- jz L$enc_key_ret
-
- movups (%rdi),%xmm0
- xorps %xmm4,%xmm4
- leaq _OPENSSL_ia32cap_P(%rip),%r10
- movl 4(%r10),%r10d
- andl $268437504,%r10d
- leaq 16(%rdx),%rax
- cmpl $256,%esi
- je L$14rounds
- cmpl $192,%esi
- je L$12rounds
- cmpl $128,%esi
- jne L$bad_keybits
-
-L$10rounds:
- movl $9,%esi
- cmpl $268435456,%r10d
- je L$10rounds_alt
-
- movups %xmm0,(%rdx)
-.byte 102,15,58,223,200,1
- call L$key_expansion_128_cold
-.byte 102,15,58,223,200,2
- call L$key_expansion_128
-.byte 102,15,58,223,200,4
- call L$key_expansion_128
-.byte 102,15,58,223,200,8
- call L$key_expansion_128
-.byte 102,15,58,223,200,16
- call L$key_expansion_128
-.byte 102,15,58,223,200,32
- call L$key_expansion_128
-.byte 102,15,58,223,200,64
- call L$key_expansion_128
-.byte 102,15,58,223,200,128
- call L$key_expansion_128
-.byte 102,15,58,223,200,27
- call L$key_expansion_128
-.byte 102,15,58,223,200,54
- call L$key_expansion_128
- movups %xmm0,(%rax)
- movl %esi,80(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
-L$10rounds_alt:
- movdqa L$key_rotate(%rip),%xmm5
- movl $8,%r10d
- movdqa L$key_rcon1(%rip),%xmm4
- movdqa %xmm0,%xmm2
- movdqu %xmm0,(%rdx)
- jmp L$oop_key128
-
-.p2align 4
-L$oop_key128:
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
- leaq 16(%rax),%rax
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,-16(%rax)
- movdqa %xmm0,%xmm2
-
- decl %r10d
- jnz L$oop_key128
-
- movdqa L$key_rcon1b(%rip),%xmm4
-
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
- pslld $1,%xmm4
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- movdqa %xmm0,%xmm2
-.byte 102,15,56,0,197
-.byte 102,15,56,221,196
-
- movdqa %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm2,%xmm3
- pslldq $4,%xmm2
- pxor %xmm3,%xmm2
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,16(%rax)
-
- movl %esi,96(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
-L$12rounds:
- movq 16(%rdi),%xmm2
- movl $11,%esi
- cmpl $268435456,%r10d
- je L$12rounds_alt
-
- movups %xmm0,(%rdx)
-.byte 102,15,58,223,202,1
- call L$key_expansion_192a_cold
-.byte 102,15,58,223,202,2
- call L$key_expansion_192b
-.byte 102,15,58,223,202,4
- call L$key_expansion_192a
-.byte 102,15,58,223,202,8
- call L$key_expansion_192b
-.byte 102,15,58,223,202,16
- call L$key_expansion_192a
-.byte 102,15,58,223,202,32
- call L$key_expansion_192b
-.byte 102,15,58,223,202,64
- call L$key_expansion_192a
-.byte 102,15,58,223,202,128
- call L$key_expansion_192b
- movups %xmm0,(%rax)
- movl %esi,48(%rax)
- xorq %rax,%rax
- jmp L$enc_key_ret
-
-.p2align 4
-L$12rounds_alt:
- movdqa L$key_rotate192(%rip),%xmm5
- movdqa L$key_rcon1(%rip),%xmm4
- movl $8,%r10d
- movdqu %xmm0,(%rdx)
- jmp L$oop_key192
-
-.p2align 4
-L$oop_key192:
- movq %xmm2,0(%rax)
- movdqa %xmm2,%xmm1
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
- pslld $1,%xmm4
- leaq 24(%rax),%rax
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
-
- pshufd $0xff,%xmm0,%xmm3
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
-
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm2
- movdqu %xmm0,-16(%rax)
-
- decl %r10d
- jnz L$oop_key192
-
- movl %esi,32(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
-L$14rounds:
- movups 16(%rdi),%xmm2
- movl $13,%esi
- leaq 16(%rax),%rax
- cmpl $268435456,%r10d
- je L$14rounds_alt
-
- movups %xmm0,(%rdx)
- movups %xmm2,16(%rdx)
-.byte 102,15,58,223,202,1
- call L$key_expansion_256a_cold
-.byte 102,15,58,223,200,1
- call L$key_expansion_256b
-.byte 102,15,58,223,202,2
- call L$key_expansion_256a
-.byte 102,15,58,223,200,2
- call L$key_expansion_256b
-.byte 102,15,58,223,202,4
- call L$key_expansion_256a
-.byte 102,15,58,223,200,4
- call L$key_expansion_256b
-.byte 102,15,58,223,202,8
- call L$key_expansion_256a
-.byte 102,15,58,223,200,8
- call L$key_expansion_256b
-.byte 102,15,58,223,202,16
- call L$key_expansion_256a
-.byte 102,15,58,223,200,16
- call L$key_expansion_256b
-.byte 102,15,58,223,202,32
- call L$key_expansion_256a
-.byte 102,15,58,223,200,32
- call L$key_expansion_256b
-.byte 102,15,58,223,202,64
- call L$key_expansion_256a
- movups %xmm0,(%rax)
- movl %esi,16(%rax)
- xorq %rax,%rax
- jmp L$enc_key_ret
-
-.p2align 4
-L$14rounds_alt:
- movdqa L$key_rotate(%rip),%xmm5
- movdqa L$key_rcon1(%rip),%xmm4
- movl $7,%r10d
- movdqu %xmm0,0(%rdx)
- movdqa %xmm2,%xmm1
- movdqu %xmm2,16(%rdx)
- jmp L$oop_key256
-
-.p2align 4
-L$oop_key256:
-.byte 102,15,56,0,213
-.byte 102,15,56,221,212
-
- movdqa %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm0,%xmm3
- pslldq $4,%xmm0
- pxor %xmm3,%xmm0
- pslld $1,%xmm4
-
- pxor %xmm2,%xmm0
- movdqu %xmm0,(%rax)
-
- decl %r10d
- jz L$done_key256
-
- pshufd $0xff,%xmm0,%xmm2
- pxor %xmm3,%xmm3
-.byte 102,15,56,221,211
-
- movdqa %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm1,%xmm3
- pslldq $4,%xmm1
- pxor %xmm3,%xmm1
-
- pxor %xmm1,%xmm2
- movdqu %xmm2,16(%rax)
- leaq 32(%rax),%rax
- movdqa %xmm2,%xmm1
-
- jmp L$oop_key256
-
-L$done_key256:
- movl %esi,16(%rax)
- xorl %eax,%eax
- jmp L$enc_key_ret
-
-.p2align 4
-L$bad_keybits:
- movq $-2,%rax
-L$enc_key_ret:
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- addq $8,%rsp
-
- ret
-
-L$SEH_end_set_encrypt_key:
-
-.p2align 4
-L$key_expansion_128:
- movups %xmm0,(%rax)
- leaq 16(%rax),%rax
-L$key_expansion_128_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-
-.p2align 4
-L$key_expansion_192a:
- movups %xmm0,(%rax)
- leaq 16(%rax),%rax
-L$key_expansion_192a_cold:
- movaps %xmm2,%xmm5
-L$key_expansion_192b_warm:
- shufps $16,%xmm0,%xmm4
- movdqa %xmm2,%xmm3
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- pslldq $4,%xmm3
- xorps %xmm4,%xmm0
- pshufd $85,%xmm1,%xmm1
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
- pshufd $255,%xmm0,%xmm3
- pxor %xmm3,%xmm2
- ret
-
-.p2align 4
-L$key_expansion_192b:
- movaps %xmm0,%xmm3
- shufps $68,%xmm0,%xmm5
- movups %xmm5,(%rax)
- shufps $78,%xmm2,%xmm3
- movups %xmm3,16(%rax)
- leaq 32(%rax),%rax
- jmp L$key_expansion_192b_warm
-
-.p2align 4
-L$key_expansion_256a:
- movups %xmm2,(%rax)
- leaq 16(%rax),%rax
-L$key_expansion_256a_cold:
- shufps $16,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $140,%xmm0,%xmm4
- xorps %xmm4,%xmm0
- shufps $255,%xmm1,%xmm1
- xorps %xmm1,%xmm0
- ret
-
-.p2align 4
-L$key_expansion_256b:
- movups %xmm0,(%rax)
- leaq 16(%rax),%rax
-
- shufps $16,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $140,%xmm2,%xmm4
- xorps %xmm4,%xmm2
- shufps $170,%xmm1,%xmm1
- xorps %xmm1,%xmm2
- ret
-
-
-.section __DATA,__const
-.p2align 6
-L$bswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$increment32:
-.long 6,6,6,0
-L$increment64:
-.long 1,0,0,0
-L$xts_magic:
-.long 0x87,0,1,0
-L$increment1:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-L$key_rotate:
-.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
-L$key_rotate192:
-.long 0x04070605,0x04070605,0x04070605,0x04070605
-L$key_rcon1:
-.long 1,1,1,1
-L$key_rcon1b:
-.long 0x1b,0x1b,0x1b,0x1b
-
-.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align 6
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S
deleted file mode 100644
index bcbf824..0000000
--- a/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S
+++ /dev/null
@@ -1,423 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-
-
-
-.globl _gcm_gmult_ssse3
-.private_extern _gcm_gmult_ssse3
-.p2align 4
-_gcm_gmult_ssse3:
-
-
-_CET_ENDBR
- movdqu (%rdi),%xmm0
- movdqa L$reverse_bytes(%rip),%xmm10
- movdqa L$low4_mask(%rip),%xmm2
-
-
-.byte 102,65,15,56,0,194
-
-
- movdqa %xmm2,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm2,%xmm0
-
-
-
-
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- movq $5,%rax
-L$oop_row_1:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_1
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movq $5,%rax
-L$oop_row_2:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_2
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movq $6,%rax
-L$oop_row_3:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_3
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
-
-.byte 102,65,15,56,0,210
- movdqu %xmm2,(%rdi)
-
-
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- ret
-
-
-
-
-
-
-
-
-
-.globl _gcm_ghash_ssse3
-.private_extern _gcm_ghash_ssse3
-.p2align 4
-_gcm_ghash_ssse3:
-
-
-_CET_ENDBR
- movdqu (%rdi),%xmm0
- movdqa L$reverse_bytes(%rip),%xmm10
- movdqa L$low4_mask(%rip),%xmm11
-
-
- andq $-16,%rcx
-
-
-
-.byte 102,65,15,56,0,194
-
-
- pxor %xmm3,%xmm3
-L$oop_ghash:
-
- movdqu (%rdx),%xmm1
-.byte 102,65,15,56,0,202
- pxor %xmm1,%xmm0
-
-
- movdqa %xmm11,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm11,%xmm0
-
-
-
-
- pxor %xmm2,%xmm2
-
- movq $5,%rax
-L$oop_row_4:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_4
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movq $5,%rax
-L$oop_row_5:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_5
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movq $6,%rax
-L$oop_row_6:
- movdqa (%rsi),%xmm4
- leaq 16(%rsi),%rsi
-
-
- movdqa %xmm2,%xmm6
-.byte 102,15,58,15,243,1
- movdqa %xmm6,%xmm3
- psrldq $1,%xmm2
-
-
-
-
- movdqa %xmm4,%xmm5
-.byte 102,15,56,0,224
-.byte 102,15,56,0,233
-
-
- pxor %xmm5,%xmm2
-
-
-
- movdqa %xmm4,%xmm5
- psllq $60,%xmm5
- movdqa %xmm5,%xmm6
- pslldq $8,%xmm6
- pxor %xmm6,%xmm3
-
-
- psrldq $8,%xmm5
- pxor %xmm5,%xmm2
- psrlq $4,%xmm4
- pxor %xmm4,%xmm2
-
- subq $1,%rax
- jnz L$oop_row_6
-
-
-
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $1,%xmm3
- pxor %xmm3,%xmm2
- psrlq $5,%xmm3
- pxor %xmm3,%xmm2
- pxor %xmm3,%xmm3
- movdqa %xmm2,%xmm0
-
-
- leaq -256(%rsi),%rsi
-
-
- leaq 16(%rdx),%rdx
- subq $16,%rcx
- jnz L$oop_ghash
-
-
-.byte 102,65,15,56,0,194
- movdqu %xmm0,(%rdi)
-
-
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- ret
-
-
-
-
-.section __DATA,__const
-.p2align 4
-
-
-L$reverse_bytes:
-.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-L$low4_mask:
-.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S
deleted file mode 100644
index c17d8f7..0000000
--- a/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S
+++ /dev/null
@@ -1,1132 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-.globl _gcm_init_clmul
-.private_extern _gcm_init_clmul
-
-.p2align 4
-_gcm_init_clmul:
-
-
-_CET_ENDBR
-L$_init_clmul:
- movdqu (%rsi),%xmm2
- pshufd $78,%xmm2,%xmm2
-
-
- pshufd $255,%xmm2,%xmm4
- movdqa %xmm2,%xmm3
- psllq $1,%xmm2
- pxor %xmm5,%xmm5
- psrlq $63,%xmm3
- pcmpgtd %xmm4,%xmm5
- pslldq $8,%xmm3
- por %xmm3,%xmm2
-
-
- pand L$0x1c2_polynomial(%rip),%xmm5
- pxor %xmm5,%xmm2
-
-
- pshufd $78,%xmm2,%xmm6
- movdqa %xmm2,%xmm0
- pxor %xmm2,%xmm6
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- pshufd $78,%xmm2,%xmm3
- pshufd $78,%xmm0,%xmm4
- pxor %xmm2,%xmm3
- movdqu %xmm2,0(%rdi)
- pxor %xmm0,%xmm4
- movdqu %xmm0,16(%rdi)
-.byte 102,15,58,15,227,8
- movdqu %xmm4,32(%rdi)
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,222,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- pshufd $78,%xmm5,%xmm3
- pshufd $78,%xmm0,%xmm4
- pxor %xmm5,%xmm3
- movdqu %xmm5,48(%rdi)
- pxor %xmm0,%xmm4
- movdqu %xmm0,64(%rdi)
-.byte 102,15,58,15,227,8
- movdqu %xmm4,80(%rdi)
- ret
-
-
-
-.globl _gcm_gmult_clmul
-.private_extern _gcm_gmult_clmul
-
-.p2align 4
-_gcm_gmult_clmul:
-
-_CET_ENDBR
-L$_gmult_clmul:
- movdqu (%rdi),%xmm0
- movdqa L$bswap_mask(%rip),%xmm5
- movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm4
-.byte 102,15,56,0,197
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,220,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,197
- movdqu %xmm0,(%rdi)
- ret
-
-
-.globl _gcm_ghash_clmul
-.private_extern _gcm_ghash_clmul
-
-.p2align 5
-_gcm_ghash_clmul:
-
-
-_CET_ENDBR
-L$_ghash_clmul:
- movdqa L$bswap_mask(%rip),%xmm10
-
- movdqu (%rdi),%xmm0
- movdqu (%rsi),%xmm2
- movdqu 32(%rsi),%xmm7
-.byte 102,65,15,56,0,194
-
- subq $0x10,%rcx
- jz L$odd_tail
-
- movdqu 16(%rsi),%xmm6
- leaq _OPENSSL_ia32cap_P(%rip),%rax
- movl 4(%rax),%eax
- cmpq $0x30,%rcx
- jb L$skip4x
-
- andl $71303168,%eax
- cmpl $4194304,%eax
- je L$skip4x
-
- subq $0x30,%rcx
- movq $0xA040608020C0E000,%rax
- movdqu 48(%rsi),%xmm14
- movdqu 64(%rsi),%xmm15
-
-
-
-
- movdqu 48(%rdx),%xmm3
- movdqu 32(%rdx),%xmm11
-.byte 102,65,15,56,0,218
-.byte 102,69,15,56,0,218
- movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
-
- movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
- pxor %xmm11,%xmm12
-.byte 102,68,15,58,68,222,0
-.byte 102,68,15,58,68,238,17
-.byte 102,68,15,58,68,231,16
- xorps %xmm11,%xmm3
- xorps %xmm13,%xmm5
- movups 80(%rsi),%xmm7
- xorps %xmm12,%xmm4
-
- movdqu 16(%rdx),%xmm11
- movdqu 0(%rdx),%xmm8
-.byte 102,69,15,56,0,218
-.byte 102,69,15,56,0,194
- movdqa %xmm11,%xmm13
- pshufd $78,%xmm11,%xmm12
- pxor %xmm8,%xmm0
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-.byte 102,69,15,58,68,238,17
-.byte 102,68,15,58,68,231,0
- xorps %xmm11,%xmm3
- xorps %xmm13,%xmm5
-
- leaq 64(%rdx),%rdx
- subq $0x40,%rcx
- jc L$tail4x
-
- jmp L$mod4_loop
-.p2align 5
-L$mod4_loop:
-.byte 102,65,15,58,68,199,0
- xorps %xmm12,%xmm4
- movdqu 48(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,65,15,58,68,207,17
- xorps %xmm3,%xmm0
- movdqu 32(%rdx),%xmm3
- movdqa %xmm11,%xmm13
-.byte 102,68,15,58,68,199,16
- pshufd $78,%xmm11,%xmm12
- xorps %xmm5,%xmm1
- pxor %xmm11,%xmm12
-.byte 102,65,15,56,0,218
- movups 32(%rsi),%xmm7
- xorps %xmm4,%xmm8
-.byte 102,68,15,58,68,218,0
- pshufd $78,%xmm3,%xmm4
-
- pxor %xmm0,%xmm8
- movdqa %xmm3,%xmm5
- pxor %xmm1,%xmm8
- pxor %xmm3,%xmm4
- movdqa %xmm8,%xmm9
-.byte 102,68,15,58,68,234,17
- pslldq $8,%xmm8
- psrldq $8,%xmm9
- pxor %xmm8,%xmm0
- movdqa L$7_mask(%rip),%xmm8
- pxor %xmm9,%xmm1
-.byte 102,76,15,110,200
-
- pand %xmm0,%xmm8
-.byte 102,69,15,56,0,200
- pxor %xmm0,%xmm9
-.byte 102,68,15,58,68,231,0
- psllq $57,%xmm9
- movdqa %xmm9,%xmm8
- pslldq $8,%xmm9
-.byte 102,15,58,68,222,0
- psrldq $8,%xmm8
- pxor %xmm9,%xmm0
- pxor %xmm8,%xmm1
- movdqu 0(%rdx),%xmm8
-
- movdqa %xmm0,%xmm9
- psrlq $1,%xmm0
-.byte 102,15,58,68,238,17
- xorps %xmm11,%xmm3
- movdqu 16(%rdx),%xmm11
-.byte 102,69,15,56,0,218
-.byte 102,15,58,68,231,16
- xorps %xmm13,%xmm5
- movups 80(%rsi),%xmm7
-.byte 102,69,15,56,0,194
- pxor %xmm9,%xmm1
- pxor %xmm0,%xmm9
- psrlq $5,%xmm0
-
- movdqa %xmm11,%xmm13
- pxor %xmm12,%xmm4
- pshufd $78,%xmm11,%xmm12
- pxor %xmm9,%xmm0
- pxor %xmm8,%xmm1
- pxor %xmm11,%xmm12
-.byte 102,69,15,58,68,222,0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- movdqa %xmm0,%xmm1
-.byte 102,69,15,58,68,238,17
- xorps %xmm11,%xmm3
- pshufd $78,%xmm0,%xmm8
- pxor %xmm0,%xmm8
-
-.byte 102,68,15,58,68,231,0
- xorps %xmm13,%xmm5
-
- leaq 64(%rdx),%rdx
- subq $0x40,%rcx
- jnc L$mod4_loop
-
-L$tail4x:
-.byte 102,65,15,58,68,199,0
-.byte 102,65,15,58,68,207,17
-.byte 102,68,15,58,68,199,16
- xorps %xmm12,%xmm4
- xorps %xmm3,%xmm0
- xorps %xmm5,%xmm1
- pxor %xmm0,%xmm1
- pxor %xmm4,%xmm8
-
- pxor %xmm1,%xmm8
- pxor %xmm0,%xmm1
-
- movdqa %xmm8,%xmm9
- psrldq $8,%xmm8
- pslldq $8,%xmm9
- pxor %xmm8,%xmm1
- pxor %xmm9,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- addq $0x40,%rcx
- jz L$done
- movdqu 32(%rsi),%xmm7
- subq $0x10,%rcx
- jz L$odd_tail
-L$skip4x:
-
-
-
-
-
- movdqu (%rdx),%xmm8
- movdqu 16(%rdx),%xmm3
-.byte 102,69,15,56,0,194
-.byte 102,65,15,56,0,218
- pxor %xmm8,%xmm0
-
- movdqa %xmm3,%xmm5
- pshufd $78,%xmm3,%xmm4
- pxor %xmm3,%xmm4
-.byte 102,15,58,68,218,0
-.byte 102,15,58,68,234,17
-.byte 102,15,58,68,231,0
-
- leaq 32(%rdx),%rdx
- nop
- subq $0x20,%rcx
- jbe L$even_tail
- nop
- jmp L$mod_loop
-
-.p2align 5
-L$mod_loop:
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
- pxor %xmm0,%xmm4
-
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
-
- pxor %xmm3,%xmm0
- pxor %xmm5,%xmm1
- movdqu (%rdx),%xmm9
- pxor %xmm0,%xmm8
-.byte 102,69,15,56,0,202
- movdqu 16(%rdx),%xmm3
-
- pxor %xmm1,%xmm8
- pxor %xmm9,%xmm1
- pxor %xmm8,%xmm4
-.byte 102,65,15,56,0,218
- movdqa %xmm4,%xmm8
- psrldq $8,%xmm8
- pslldq $8,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm3,%xmm5
-
- movdqa %xmm0,%xmm9
- movdqa %xmm0,%xmm8
- psllq $5,%xmm0
- pxor %xmm0,%xmm8
-.byte 102,15,58,68,218,0
- psllq $1,%xmm0
- pxor %xmm8,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm8
- pslldq $8,%xmm0
- psrldq $8,%xmm8
- pxor %xmm9,%xmm0
- pshufd $78,%xmm5,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm5,%xmm4
-
- movdqa %xmm0,%xmm9
- psrlq $1,%xmm0
-.byte 102,15,58,68,234,17
- pxor %xmm9,%xmm1
- pxor %xmm0,%xmm9
- psrlq $5,%xmm0
- pxor %xmm9,%xmm0
- leaq 32(%rdx),%rdx
- psrlq $1,%xmm0
-.byte 102,15,58,68,231,0
- pxor %xmm1,%xmm0
-
- subq $0x20,%rcx
- ja L$mod_loop
-
-L$even_tail:
- movdqa %xmm0,%xmm1
- movdqa %xmm4,%xmm8
- pshufd $78,%xmm0,%xmm4
- pxor %xmm0,%xmm4
-
-.byte 102,15,58,68,198,0
-.byte 102,15,58,68,206,17
-.byte 102,15,58,68,231,16
-
- pxor %xmm3,%xmm0
- pxor %xmm5,%xmm1
- pxor %xmm0,%xmm8
- pxor %xmm1,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm8
- psrldq $8,%xmm8
- pslldq $8,%xmm4
- pxor %xmm8,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
- testq %rcx,%rcx
- jnz L$done
-
-L$odd_tail:
- movdqu (%rdx),%xmm8
-.byte 102,69,15,56,0,194
- pxor %xmm8,%xmm0
- movdqa %xmm0,%xmm1
- pshufd $78,%xmm0,%xmm3
- pxor %xmm0,%xmm3
-.byte 102,15,58,68,194,0
-.byte 102,15,58,68,202,17
-.byte 102,15,58,68,223,0
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
-
- movdqa %xmm3,%xmm4
- psrldq $8,%xmm3
- pslldq $8,%xmm4
- pxor %xmm3,%xmm1
- pxor %xmm4,%xmm0
-
- movdqa %xmm0,%xmm4
- movdqa %xmm0,%xmm3
- psllq $5,%xmm0
- pxor %xmm0,%xmm3
- psllq $1,%xmm0
- pxor %xmm3,%xmm0
- psllq $57,%xmm0
- movdqa %xmm0,%xmm3
- pslldq $8,%xmm0
- psrldq $8,%xmm3
- pxor %xmm4,%xmm0
- pxor %xmm3,%xmm1
-
-
- movdqa %xmm0,%xmm4
- psrlq $1,%xmm0
- pxor %xmm4,%xmm1
- pxor %xmm0,%xmm4
- psrlq $5,%xmm0
- pxor %xmm4,%xmm0
- psrlq $1,%xmm0
- pxor %xmm1,%xmm0
-L$done:
-.byte 102,65,15,56,0,194
- movdqu %xmm0,(%rdi)
- ret
-
-
-
-.globl _gcm_init_avx
-.private_extern _gcm_init_avx
-
-.p2align 5
-_gcm_init_avx:
-
-_CET_ENDBR
- vzeroupper
-
- vmovdqu (%rsi),%xmm2
- vpshufd $78,%xmm2,%xmm2
-
-
- vpshufd $255,%xmm2,%xmm4
- vpsrlq $63,%xmm2,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpxor %xmm5,%xmm5,%xmm5
- vpcmpgtd %xmm4,%xmm5,%xmm5
- vpslldq $8,%xmm3,%xmm3
- vpor %xmm3,%xmm2,%xmm2
-
-
- vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
- vpxor %xmm5,%xmm2,%xmm2
-
- vpunpckhqdq %xmm2,%xmm2,%xmm6
- vmovdqa %xmm2,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- movq $4,%r10
- jmp L$init_start_avx
-.p2align 5
-L$init_loop_avx:
- vpalignr $8,%xmm3,%xmm4,%xmm5
- vmovdqu %xmm5,-16(%rdi)
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-L$init_start_avx:
- vmovdqa %xmm0,%xmm5
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
- vpshufd $78,%xmm5,%xmm3
- vpshufd $78,%xmm0,%xmm4
- vpxor %xmm5,%xmm3,%xmm3
- vmovdqu %xmm5,0(%rdi)
- vpxor %xmm0,%xmm4,%xmm4
- vmovdqu %xmm0,16(%rdi)
- leaq 48(%rdi),%rdi
- subq $1,%r10
- jnz L$init_loop_avx
-
- vpalignr $8,%xmm4,%xmm3,%xmm5
- vmovdqu %xmm5,-16(%rdi)
-
- vzeroupper
- ret
-
-
-
-.globl _gcm_gmult_avx
-.private_extern _gcm_gmult_avx
-
-.p2align 5
-_gcm_gmult_avx:
-
-_CET_ENDBR
- jmp L$_gmult_clmul
-
-
-.globl _gcm_ghash_avx
-.private_extern _gcm_ghash_avx
-
-.p2align 5
-_gcm_ghash_avx:
-
-_CET_ENDBR
- vzeroupper
-
- vmovdqu (%rdi),%xmm10
- leaq L$0x1c2_polynomial(%rip),%r10
- leaq 64(%rsi),%rsi
- vmovdqu L$bswap_mask(%rip),%xmm13
- vpshufb %xmm13,%xmm10,%xmm10
- cmpq $0x80,%rcx
- jb L$short_avx
- subq $0x80,%rcx
-
- vmovdqu 112(%rdx),%xmm14
- vmovdqu 0-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vmovdqu 32-64(%rsi),%xmm7
-
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm14,%xmm9,%xmm9
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 80(%rdx),%xmm14
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 48-64(%rsi),%xmm6
- vpxor %xmm14,%xmm9,%xmm9
- vmovdqu 64(%rdx),%xmm15
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
-
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 48(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 32(%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 16(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu (%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
-
- leaq 128(%rdx),%rdx
- cmpq $0x80,%rcx
- jb L$tail_avx
-
- vpxor %xmm10,%xmm15,%xmm15
- subq $0x80,%rcx
- jmp L$oop8x_avx
-
-.p2align 5
-L$oop8x_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 112(%rdx),%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpxor %xmm15,%xmm8,%xmm8
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
- vmovdqu 0-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
- vmovdqu 32-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm3,%xmm10,%xmm10
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vxorps %xmm4,%xmm11,%xmm11
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm5,%xmm12,%xmm12
- vxorps %xmm15,%xmm8,%xmm8
-
- vmovdqu 80(%rdx),%xmm14
- vpxor %xmm10,%xmm12,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm11,%xmm12,%xmm12
- vpslldq $8,%xmm12,%xmm9
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vpsrldq $8,%xmm12,%xmm12
- vpxor %xmm9,%xmm10,%xmm10
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vxorps %xmm12,%xmm11,%xmm11
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 64(%rdx),%xmm15
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vxorps %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
-
- vmovdqu 48(%rdx),%xmm14
- vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 32(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
- vxorps %xmm12,%xmm10,%xmm10
-
- vmovdqu 16(%rdx),%xmm14
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
- vxorps %xmm11,%xmm12,%xmm12
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu (%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm12,%xmm15,%xmm15
- vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
- vpxor %xmm10,%xmm15,%xmm15
-
- leaq 128(%rdx),%rdx
- subq $0x80,%rcx
- jnc L$oop8x_avx
-
- addq $0x80,%rcx
- jmp L$tail_no_xor_avx
-
-.p2align 5
-L$short_avx:
- vmovdqu -16(%rdx,%rcx,1),%xmm14
- leaq (%rdx,%rcx,1),%rdx
- vmovdqu 0-64(%rsi),%xmm6
- vmovdqu 32-64(%rsi),%xmm7
- vpshufb %xmm13,%xmm14,%xmm15
-
- vmovdqa %xmm0,%xmm3
- vmovdqa %xmm1,%xmm4
- vmovdqa %xmm2,%xmm5
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -32(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -48(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovdqu 80-64(%rsi),%xmm7
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -64(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -80(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 96-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovdqu 128-64(%rsi),%xmm7
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -96(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz L$tail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -112(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 144-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovq 184-64(%rsi),%xmm7
- subq $0x10,%rcx
- jmp L$tail_avx
-
-.p2align 5
-L$tail_avx:
- vpxor %xmm10,%xmm15,%xmm15
-L$tail_no_xor_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
-
- vmovdqu (%r10),%xmm12
-
- vpxor %xmm0,%xmm3,%xmm10
- vpxor %xmm1,%xmm4,%xmm11
- vpxor %xmm2,%xmm5,%xmm5
-
- vpxor %xmm10,%xmm5,%xmm5
- vpxor %xmm11,%xmm5,%xmm5
- vpslldq $8,%xmm5,%xmm9
- vpsrldq $8,%xmm5,%xmm5
- vpxor %xmm9,%xmm10,%xmm10
- vpxor %xmm5,%xmm11,%xmm11
-
- vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm11,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- cmpq $0,%rcx
- jne L$short_avx
-
- vpshufb %xmm13,%xmm10,%xmm10
- vmovdqu %xmm10,(%rdi)
- vzeroupper
- ret
-
-
-
-.section __DATA,__const
-.p2align 6
-L$bswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$0x1c2_polynomial:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-L$7_mask:
-.long 7,0,7,0
-.p2align 6
-
-.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align 6
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S
deleted file mode 100644
index e4c0241..0000000
--- a/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S
+++ /dev/null
@@ -1,690 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-.p2align 4
-
-.globl _md5_block_asm_data_order
-.private_extern _md5_block_asm_data_order
-
-_md5_block_asm_data_order:
-
-_CET_ENDBR
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r14
-
- pushq %r15
-
-L$prologue:
-
-
-
-
- movq %rdi,%rbp
- shlq $6,%rdx
- leaq (%rsi,%rdx,1),%rdi
- movl 0(%rbp),%eax
- movl 4(%rbp),%ebx
- movl 8(%rbp),%ecx
- movl 12(%rbp),%edx
-
-
-
-
-
-
-
- cmpq %rdi,%rsi
- je L$end
-
-
-L$loop:
- movl %eax,%r8d
- movl %ebx,%r9d
- movl %ecx,%r14d
- movl %edx,%r15d
- movl 0(%rsi),%r10d
- movl %edx,%r11d
- xorl %ecx,%r11d
- leal -680876936(%rax,%r10,1),%eax
- andl %ebx,%r11d
- xorl %edx,%r11d
- movl 4(%rsi),%r10d
- addl %r11d,%eax
- roll $7,%eax
- movl %ecx,%r11d
- addl %ebx,%eax
- xorl %ebx,%r11d
- leal -389564586(%rdx,%r10,1),%edx
- andl %eax,%r11d
- xorl %ecx,%r11d
- movl 8(%rsi),%r10d
- addl %r11d,%edx
- roll $12,%edx
- movl %ebx,%r11d
- addl %eax,%edx
- xorl %eax,%r11d
- leal 606105819(%rcx,%r10,1),%ecx
- andl %edx,%r11d
- xorl %ebx,%r11d
- movl 12(%rsi),%r10d
- addl %r11d,%ecx
- roll $17,%ecx
- movl %eax,%r11d
- addl %edx,%ecx
- xorl %edx,%r11d
- leal -1044525330(%rbx,%r10,1),%ebx
- andl %ecx,%r11d
- xorl %eax,%r11d
- movl 16(%rsi),%r10d
- addl %r11d,%ebx
- roll $22,%ebx
- movl %edx,%r11d
- addl %ecx,%ebx
- xorl %ecx,%r11d
- leal -176418897(%rax,%r10,1),%eax
- andl %ebx,%r11d
- xorl %edx,%r11d
- movl 20(%rsi),%r10d
- addl %r11d,%eax
- roll $7,%eax
- movl %ecx,%r11d
- addl %ebx,%eax
- xorl %ebx,%r11d
- leal 1200080426(%rdx,%r10,1),%edx
- andl %eax,%r11d
- xorl %ecx,%r11d
- movl 24(%rsi),%r10d
- addl %r11d,%edx
- roll $12,%edx
- movl %ebx,%r11d
- addl %eax,%edx
- xorl %eax,%r11d
- leal -1473231341(%rcx,%r10,1),%ecx
- andl %edx,%r11d
- xorl %ebx,%r11d
- movl 28(%rsi),%r10d
- addl %r11d,%ecx
- roll $17,%ecx
- movl %eax,%r11d
- addl %edx,%ecx
- xorl %edx,%r11d
- leal -45705983(%rbx,%r10,1),%ebx
- andl %ecx,%r11d
- xorl %eax,%r11d
- movl 32(%rsi),%r10d
- addl %r11d,%ebx
- roll $22,%ebx
- movl %edx,%r11d
- addl %ecx,%ebx
- xorl %ecx,%r11d
- leal 1770035416(%rax,%r10,1),%eax
- andl %ebx,%r11d
- xorl %edx,%r11d
- movl 36(%rsi),%r10d
- addl %r11d,%eax
- roll $7,%eax
- movl %ecx,%r11d
- addl %ebx,%eax
- xorl %ebx,%r11d
- leal -1958414417(%rdx,%r10,1),%edx
- andl %eax,%r11d
- xorl %ecx,%r11d
- movl 40(%rsi),%r10d
- addl %r11d,%edx
- roll $12,%edx
- movl %ebx,%r11d
- addl %eax,%edx
- xorl %eax,%r11d
- leal -42063(%rcx,%r10,1),%ecx
- andl %edx,%r11d
- xorl %ebx,%r11d
- movl 44(%rsi),%r10d
- addl %r11d,%ecx
- roll $17,%ecx
- movl %eax,%r11d
- addl %edx,%ecx
- xorl %edx,%r11d
- leal -1990404162(%rbx,%r10,1),%ebx
- andl %ecx,%r11d
- xorl %eax,%r11d
- movl 48(%rsi),%r10d
- addl %r11d,%ebx
- roll $22,%ebx
- movl %edx,%r11d
- addl %ecx,%ebx
- xorl %ecx,%r11d
- leal 1804603682(%rax,%r10,1),%eax
- andl %ebx,%r11d
- xorl %edx,%r11d
- movl 52(%rsi),%r10d
- addl %r11d,%eax
- roll $7,%eax
- movl %ecx,%r11d
- addl %ebx,%eax
- xorl %ebx,%r11d
- leal -40341101(%rdx,%r10,1),%edx
- andl %eax,%r11d
- xorl %ecx,%r11d
- movl 56(%rsi),%r10d
- addl %r11d,%edx
- roll $12,%edx
- movl %ebx,%r11d
- addl %eax,%edx
- xorl %eax,%r11d
- leal -1502002290(%rcx,%r10,1),%ecx
- andl %edx,%r11d
- xorl %ebx,%r11d
- movl 60(%rsi),%r10d
- addl %r11d,%ecx
- roll $17,%ecx
- movl %eax,%r11d
- addl %edx,%ecx
- xorl %edx,%r11d
- leal 1236535329(%rbx,%r10,1),%ebx
- andl %ecx,%r11d
- xorl %eax,%r11d
- movl 0(%rsi),%r10d
- addl %r11d,%ebx
- roll $22,%ebx
- movl %edx,%r11d
- addl %ecx,%ebx
- movl 4(%rsi),%r10d
- movl %edx,%r11d
- movl %edx,%r12d
- notl %r11d
- leal -165796510(%rax,%r10,1),%eax
- andl %ebx,%r12d
- andl %ecx,%r11d
- movl 24(%rsi),%r10d
- orl %r11d,%r12d
- movl %ecx,%r11d
- addl %r12d,%eax
- movl %ecx,%r12d
- roll $5,%eax
- addl %ebx,%eax
- notl %r11d
- leal -1069501632(%rdx,%r10,1),%edx
- andl %eax,%r12d
- andl %ebx,%r11d
- movl 44(%rsi),%r10d
- orl %r11d,%r12d
- movl %ebx,%r11d
- addl %r12d,%edx
- movl %ebx,%r12d
- roll $9,%edx
- addl %eax,%edx
- notl %r11d
- leal 643717713(%rcx,%r10,1),%ecx
- andl %edx,%r12d
- andl %eax,%r11d
- movl 0(%rsi),%r10d
- orl %r11d,%r12d
- movl %eax,%r11d
- addl %r12d,%ecx
- movl %eax,%r12d
- roll $14,%ecx
- addl %edx,%ecx
- notl %r11d
- leal -373897302(%rbx,%r10,1),%ebx
- andl %ecx,%r12d
- andl %edx,%r11d
- movl 20(%rsi),%r10d
- orl %r11d,%r12d
- movl %edx,%r11d
- addl %r12d,%ebx
- movl %edx,%r12d
- roll $20,%ebx
- addl %ecx,%ebx
- notl %r11d
- leal -701558691(%rax,%r10,1),%eax
- andl %ebx,%r12d
- andl %ecx,%r11d
- movl 40(%rsi),%r10d
- orl %r11d,%r12d
- movl %ecx,%r11d
- addl %r12d,%eax
- movl %ecx,%r12d
- roll $5,%eax
- addl %ebx,%eax
- notl %r11d
- leal 38016083(%rdx,%r10,1),%edx
- andl %eax,%r12d
- andl %ebx,%r11d
- movl 60(%rsi),%r10d
- orl %r11d,%r12d
- movl %ebx,%r11d
- addl %r12d,%edx
- movl %ebx,%r12d
- roll $9,%edx
- addl %eax,%edx
- notl %r11d
- leal -660478335(%rcx,%r10,1),%ecx
- andl %edx,%r12d
- andl %eax,%r11d
- movl 16(%rsi),%r10d
- orl %r11d,%r12d
- movl %eax,%r11d
- addl %r12d,%ecx
- movl %eax,%r12d
- roll $14,%ecx
- addl %edx,%ecx
- notl %r11d
- leal -405537848(%rbx,%r10,1),%ebx
- andl %ecx,%r12d
- andl %edx,%r11d
- movl 36(%rsi),%r10d
- orl %r11d,%r12d
- movl %edx,%r11d
- addl %r12d,%ebx
- movl %edx,%r12d
- roll $20,%ebx
- addl %ecx,%ebx
- notl %r11d
- leal 568446438(%rax,%r10,1),%eax
- andl %ebx,%r12d
- andl %ecx,%r11d
- movl 56(%rsi),%r10d
- orl %r11d,%r12d
- movl %ecx,%r11d
- addl %r12d,%eax
- movl %ecx,%r12d
- roll $5,%eax
- addl %ebx,%eax
- notl %r11d
- leal -1019803690(%rdx,%r10,1),%edx
- andl %eax,%r12d
- andl %ebx,%r11d
- movl 12(%rsi),%r10d
- orl %r11d,%r12d
- movl %ebx,%r11d
- addl %r12d,%edx
- movl %ebx,%r12d
- roll $9,%edx
- addl %eax,%edx
- notl %r11d
- leal -187363961(%rcx,%r10,1),%ecx
- andl %edx,%r12d
- andl %eax,%r11d
- movl 32(%rsi),%r10d
- orl %r11d,%r12d
- movl %eax,%r11d
- addl %r12d,%ecx
- movl %eax,%r12d
- roll $14,%ecx
- addl %edx,%ecx
- notl %r11d
- leal 1163531501(%rbx,%r10,1),%ebx
- andl %ecx,%r12d
- andl %edx,%r11d
- movl 52(%rsi),%r10d
- orl %r11d,%r12d
- movl %edx,%r11d
- addl %r12d,%ebx
- movl %edx,%r12d
- roll $20,%ebx
- addl %ecx,%ebx
- notl %r11d
- leal -1444681467(%rax,%r10,1),%eax
- andl %ebx,%r12d
- andl %ecx,%r11d
- movl 8(%rsi),%r10d
- orl %r11d,%r12d
- movl %ecx,%r11d
- addl %r12d,%eax
- movl %ecx,%r12d
- roll $5,%eax
- addl %ebx,%eax
- notl %r11d
- leal -51403784(%rdx,%r10,1),%edx
- andl %eax,%r12d
- andl %ebx,%r11d
- movl 28(%rsi),%r10d
- orl %r11d,%r12d
- movl %ebx,%r11d
- addl %r12d,%edx
- movl %ebx,%r12d
- roll $9,%edx
- addl %eax,%edx
- notl %r11d
- leal 1735328473(%rcx,%r10,1),%ecx
- andl %edx,%r12d
- andl %eax,%r11d
- movl 48(%rsi),%r10d
- orl %r11d,%r12d
- movl %eax,%r11d
- addl %r12d,%ecx
- movl %eax,%r12d
- roll $14,%ecx
- addl %edx,%ecx
- notl %r11d
- leal -1926607734(%rbx,%r10,1),%ebx
- andl %ecx,%r12d
- andl %edx,%r11d
- movl 0(%rsi),%r10d
- orl %r11d,%r12d
- movl %edx,%r11d
- addl %r12d,%ebx
- movl %edx,%r12d
- roll $20,%ebx
- addl %ecx,%ebx
- movl 20(%rsi),%r10d
- movl %ecx,%r11d
- leal -378558(%rax,%r10,1),%eax
- movl 32(%rsi),%r10d
- xorl %edx,%r11d
- xorl %ebx,%r11d
- addl %r11d,%eax
- roll $4,%eax
- movl %ebx,%r11d
- addl %ebx,%eax
- leal -2022574463(%rdx,%r10,1),%edx
- movl 44(%rsi),%r10d
- xorl %ecx,%r11d
- xorl %eax,%r11d
- addl %r11d,%edx
- roll $11,%edx
- movl %eax,%r11d
- addl %eax,%edx
- leal 1839030562(%rcx,%r10,1),%ecx
- movl 56(%rsi),%r10d
- xorl %ebx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ecx
- roll $16,%ecx
- movl %edx,%r11d
- addl %edx,%ecx
- leal -35309556(%rbx,%r10,1),%ebx
- movl 4(%rsi),%r10d
- xorl %eax,%r11d
- xorl %ecx,%r11d
- addl %r11d,%ebx
- roll $23,%ebx
- movl %ecx,%r11d
- addl %ecx,%ebx
- leal -1530992060(%rax,%r10,1),%eax
- movl 16(%rsi),%r10d
- xorl %edx,%r11d
- xorl %ebx,%r11d
- addl %r11d,%eax
- roll $4,%eax
- movl %ebx,%r11d
- addl %ebx,%eax
- leal 1272893353(%rdx,%r10,1),%edx
- movl 28(%rsi),%r10d
- xorl %ecx,%r11d
- xorl %eax,%r11d
- addl %r11d,%edx
- roll $11,%edx
- movl %eax,%r11d
- addl %eax,%edx
- leal -155497632(%rcx,%r10,1),%ecx
- movl 40(%rsi),%r10d
- xorl %ebx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ecx
- roll $16,%ecx
- movl %edx,%r11d
- addl %edx,%ecx
- leal -1094730640(%rbx,%r10,1),%ebx
- movl 52(%rsi),%r10d
- xorl %eax,%r11d
- xorl %ecx,%r11d
- addl %r11d,%ebx
- roll $23,%ebx
- movl %ecx,%r11d
- addl %ecx,%ebx
- leal 681279174(%rax,%r10,1),%eax
- movl 0(%rsi),%r10d
- xorl %edx,%r11d
- xorl %ebx,%r11d
- addl %r11d,%eax
- roll $4,%eax
- movl %ebx,%r11d
- addl %ebx,%eax
- leal -358537222(%rdx,%r10,1),%edx
- movl 12(%rsi),%r10d
- xorl %ecx,%r11d
- xorl %eax,%r11d
- addl %r11d,%edx
- roll $11,%edx
- movl %eax,%r11d
- addl %eax,%edx
- leal -722521979(%rcx,%r10,1),%ecx
- movl 24(%rsi),%r10d
- xorl %ebx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ecx
- roll $16,%ecx
- movl %edx,%r11d
- addl %edx,%ecx
- leal 76029189(%rbx,%r10,1),%ebx
- movl 36(%rsi),%r10d
- xorl %eax,%r11d
- xorl %ecx,%r11d
- addl %r11d,%ebx
- roll $23,%ebx
- movl %ecx,%r11d
- addl %ecx,%ebx
- leal -640364487(%rax,%r10,1),%eax
- movl 48(%rsi),%r10d
- xorl %edx,%r11d
- xorl %ebx,%r11d
- addl %r11d,%eax
- roll $4,%eax
- movl %ebx,%r11d
- addl %ebx,%eax
- leal -421815835(%rdx,%r10,1),%edx
- movl 60(%rsi),%r10d
- xorl %ecx,%r11d
- xorl %eax,%r11d
- addl %r11d,%edx
- roll $11,%edx
- movl %eax,%r11d
- addl %eax,%edx
- leal 530742520(%rcx,%r10,1),%ecx
- movl 8(%rsi),%r10d
- xorl %ebx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ecx
- roll $16,%ecx
- movl %edx,%r11d
- addl %edx,%ecx
- leal -995338651(%rbx,%r10,1),%ebx
- movl 0(%rsi),%r10d
- xorl %eax,%r11d
- xorl %ecx,%r11d
- addl %r11d,%ebx
- roll $23,%ebx
- movl %ecx,%r11d
- addl %ecx,%ebx
- movl 0(%rsi),%r10d
- movl $0xffffffff,%r11d
- xorl %edx,%r11d
- leal -198630844(%rax,%r10,1),%eax
- orl %ebx,%r11d
- xorl %ecx,%r11d
- addl %r11d,%eax
- movl 28(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $6,%eax
- xorl %ecx,%r11d
- addl %ebx,%eax
- leal 1126891415(%rdx,%r10,1),%edx
- orl %eax,%r11d
- xorl %ebx,%r11d
- addl %r11d,%edx
- movl 56(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $10,%edx
- xorl %ebx,%r11d
- addl %eax,%edx
- leal -1416354905(%rcx,%r10,1),%ecx
- orl %edx,%r11d
- xorl %eax,%r11d
- addl %r11d,%ecx
- movl 20(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $15,%ecx
- xorl %eax,%r11d
- addl %edx,%ecx
- leal -57434055(%rbx,%r10,1),%ebx
- orl %ecx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ebx
- movl 48(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $21,%ebx
- xorl %edx,%r11d
- addl %ecx,%ebx
- leal 1700485571(%rax,%r10,1),%eax
- orl %ebx,%r11d
- xorl %ecx,%r11d
- addl %r11d,%eax
- movl 12(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $6,%eax
- xorl %ecx,%r11d
- addl %ebx,%eax
- leal -1894986606(%rdx,%r10,1),%edx
- orl %eax,%r11d
- xorl %ebx,%r11d
- addl %r11d,%edx
- movl 40(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $10,%edx
- xorl %ebx,%r11d
- addl %eax,%edx
- leal -1051523(%rcx,%r10,1),%ecx
- orl %edx,%r11d
- xorl %eax,%r11d
- addl %r11d,%ecx
- movl 4(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $15,%ecx
- xorl %eax,%r11d
- addl %edx,%ecx
- leal -2054922799(%rbx,%r10,1),%ebx
- orl %ecx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ebx
- movl 32(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $21,%ebx
- xorl %edx,%r11d
- addl %ecx,%ebx
- leal 1873313359(%rax,%r10,1),%eax
- orl %ebx,%r11d
- xorl %ecx,%r11d
- addl %r11d,%eax
- movl 60(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $6,%eax
- xorl %ecx,%r11d
- addl %ebx,%eax
- leal -30611744(%rdx,%r10,1),%edx
- orl %eax,%r11d
- xorl %ebx,%r11d
- addl %r11d,%edx
- movl 24(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $10,%edx
- xorl %ebx,%r11d
- addl %eax,%edx
- leal -1560198380(%rcx,%r10,1),%ecx
- orl %edx,%r11d
- xorl %eax,%r11d
- addl %r11d,%ecx
- movl 52(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $15,%ecx
- xorl %eax,%r11d
- addl %edx,%ecx
- leal 1309151649(%rbx,%r10,1),%ebx
- orl %ecx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ebx
- movl 16(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $21,%ebx
- xorl %edx,%r11d
- addl %ecx,%ebx
- leal -145523070(%rax,%r10,1),%eax
- orl %ebx,%r11d
- xorl %ecx,%r11d
- addl %r11d,%eax
- movl 44(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $6,%eax
- xorl %ecx,%r11d
- addl %ebx,%eax
- leal -1120210379(%rdx,%r10,1),%edx
- orl %eax,%r11d
- xorl %ebx,%r11d
- addl %r11d,%edx
- movl 8(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $10,%edx
- xorl %ebx,%r11d
- addl %eax,%edx
- leal 718787259(%rcx,%r10,1),%ecx
- orl %edx,%r11d
- xorl %eax,%r11d
- addl %r11d,%ecx
- movl 36(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $15,%ecx
- xorl %eax,%r11d
- addl %edx,%ecx
- leal -343485551(%rbx,%r10,1),%ebx
- orl %ecx,%r11d
- xorl %edx,%r11d
- addl %r11d,%ebx
- movl 0(%rsi),%r10d
- movl $0xffffffff,%r11d
- roll $21,%ebx
- xorl %edx,%r11d
- addl %ecx,%ebx
-
- addl %r8d,%eax
- addl %r9d,%ebx
- addl %r14d,%ecx
- addl %r15d,%edx
-
-
- addq $64,%rsi
- cmpq %rdi,%rsi
- jb L$loop
-
-
-L$end:
- movl %eax,0(%rbp)
- movl %ebx,4(%rbp)
- movl %ecx,8(%rbp)
- movl %edx,12(%rbp)
-
- movq (%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r12
-
- movq 24(%rsp),%rbx
-
- movq 32(%rsp),%rbp
-
- addq $40,%rsp
-
-L$epilogue:
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S b/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S
deleted file mode 100644
index 81cb582..0000000
--- a/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S
+++ /dev/null
@@ -1,4473 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-.section __DATA,__const
-.p2align 6
-L$poly:
-.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
-
-L$One:
-.long 1,1,1,1,1,1,1,1
-L$Two:
-.long 2,2,2,2,2,2,2,2
-L$Three:
-.long 3,3,3,3,3,3,3,3
-L$ONE_mont:
-.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
-
-
-L$ord:
-.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
-L$ordK:
-.quad 0xccd1c8aaee00bc4f
-.text
-
-
-
-.globl _ecp_nistz256_neg
-.private_extern _ecp_nistz256_neg
-
-.p2align 5
-_ecp_nistz256_neg:
-
-_CET_ENDBR
- pushq %r12
-
- pushq %r13
-
-L$neg_body:
-
- xorq %r8,%r8
- xorq %r9,%r9
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r13,%r13
-
- subq 0(%rsi),%r8
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r8,%rax
- sbbq 24(%rsi),%r11
- leaq L$poly(%rip),%rsi
- movq %r9,%rdx
- sbbq $0,%r13
-
- addq 0(%rsi),%r8
- movq %r10,%rcx
- adcq 8(%rsi),%r9
- adcq 16(%rsi),%r10
- movq %r11,%r12
- adcq 24(%rsi),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- movq 0(%rsp),%r13
-
- movq 8(%rsp),%r12
-
- leaq 16(%rsp),%rsp
-
-L$neg_epilogue:
- ret
-
-
-
-
-
-
-
-
-.globl _ecp_nistz256_ord_mul_mont
-.private_extern _ecp_nistz256_ord_mul_mont
-
-.p2align 5
-_ecp_nistz256_ord_mul_mont:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- je L$ecp_nistz256_ord_mul_montx
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$ord_mul_body:
-
- movq 0(%rdx),%rax
- movq %rdx,%rbx
- leaq L$ord(%rip),%r14
- movq L$ordK(%rip),%r15
-
-
- movq %rax,%rcx
- mulq 0(%rsi)
- movq %rax,%r8
- movq %rcx,%rax
- movq %rdx,%r9
-
- mulq 8(%rsi)
- addq %rax,%r9
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%r10
-
- mulq 16(%rsi)
- addq %rax,%r10
- movq %rcx,%rax
- adcq $0,%rdx
-
- movq %r8,%r13
- imulq %r15,%r8
-
- movq %rdx,%r11
- mulq 24(%rsi)
- addq %rax,%r11
- movq %r8,%rax
- adcq $0,%rdx
- movq %rdx,%r12
-
-
- mulq 0(%r14)
- movq %r8,%rbp
- addq %rax,%r13
- movq %r8,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- subq %r8,%r10
- sbbq $0,%r8
-
- mulq 8(%r14)
- addq %rcx,%r9
- adcq $0,%rdx
- addq %rax,%r9
- movq %rbp,%rax
- adcq %rdx,%r10
- movq %rbp,%rdx
- adcq $0,%r8
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r11
- movq 8(%rbx),%rax
- sbbq %rdx,%rbp
-
- addq %r8,%r11
- adcq %rbp,%r12
- adcq $0,%r13
-
-
- movq %rax,%rcx
- mulq 0(%rsi)
- addq %rax,%r9
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 8(%rsi)
- addq %rbp,%r10
- adcq $0,%rdx
- addq %rax,%r10
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 16(%rsi)
- addq %rbp,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %rcx,%rax
- adcq $0,%rdx
-
- movq %r9,%rcx
- imulq %r15,%r9
-
- movq %rdx,%rbp
- mulq 24(%rsi)
- addq %rbp,%r12
- adcq $0,%rdx
- xorq %r8,%r8
- addq %rax,%r12
- movq %r9,%rax
- adcq %rdx,%r13
- adcq $0,%r8
-
-
- mulq 0(%r14)
- movq %r9,%rbp
- addq %rax,%rcx
- movq %r9,%rax
- adcq %rdx,%rcx
-
- subq %r9,%r11
- sbbq $0,%r9
-
- mulq 8(%r14)
- addq %rcx,%r10
- adcq $0,%rdx
- addq %rax,%r10
- movq %rbp,%rax
- adcq %rdx,%r11
- movq %rbp,%rdx
- adcq $0,%r9
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r12
- movq 16(%rbx),%rax
- sbbq %rdx,%rbp
-
- addq %r9,%r12
- adcq %rbp,%r13
- adcq $0,%r8
-
-
- movq %rax,%rcx
- mulq 0(%rsi)
- addq %rax,%r10
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 8(%rsi)
- addq %rbp,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 16(%rsi)
- addq %rbp,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %rcx,%rax
- adcq $0,%rdx
-
- movq %r10,%rcx
- imulq %r15,%r10
-
- movq %rdx,%rbp
- mulq 24(%rsi)
- addq %rbp,%r13
- adcq $0,%rdx
- xorq %r9,%r9
- addq %rax,%r13
- movq %r10,%rax
- adcq %rdx,%r8
- adcq $0,%r9
-
-
- mulq 0(%r14)
- movq %r10,%rbp
- addq %rax,%rcx
- movq %r10,%rax
- adcq %rdx,%rcx
-
- subq %r10,%r12
- sbbq $0,%r10
-
- mulq 8(%r14)
- addq %rcx,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %rbp,%rax
- adcq %rdx,%r12
- movq %rbp,%rdx
- adcq $0,%r10
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r13
- movq 24(%rbx),%rax
- sbbq %rdx,%rbp
-
- addq %r10,%r13
- adcq %rbp,%r8
- adcq $0,%r9
-
-
- movq %rax,%rcx
- mulq 0(%rsi)
- addq %rax,%r11
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 8(%rsi)
- addq %rbp,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %rcx,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq 16(%rsi)
- addq %rbp,%r13
- adcq $0,%rdx
- addq %rax,%r13
- movq %rcx,%rax
- adcq $0,%rdx
-
- movq %r11,%rcx
- imulq %r15,%r11
-
- movq %rdx,%rbp
- mulq 24(%rsi)
- addq %rbp,%r8
- adcq $0,%rdx
- xorq %r10,%r10
- addq %rax,%r8
- movq %r11,%rax
- adcq %rdx,%r9
- adcq $0,%r10
-
-
- mulq 0(%r14)
- movq %r11,%rbp
- addq %rax,%rcx
- movq %r11,%rax
- adcq %rdx,%rcx
-
- subq %r11,%r13
- sbbq $0,%r11
-
- mulq 8(%r14)
- addq %rcx,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %rbp,%rax
- adcq %rdx,%r13
- movq %rbp,%rdx
- adcq $0,%r11
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r8
- sbbq %rdx,%rbp
-
- addq %r11,%r8
- adcq %rbp,%r9
- adcq $0,%r10
-
-
- movq %r12,%rsi
- subq 0(%r14),%r12
- movq %r13,%r11
- sbbq 8(%r14),%r13
- movq %r8,%rcx
- sbbq 16(%r14),%r8
- movq %r9,%rbp
- sbbq 24(%r14),%r9
- sbbq $0,%r10
-
- cmovcq %rsi,%r12
- cmovcq %r11,%r13
- cmovcq %rcx,%r8
- cmovcq %rbp,%r9
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$ord_mul_epilogue:
- ret
-
-
-
-
-
-
-
-
-
-.globl _ecp_nistz256_ord_sqr_mont
-.private_extern _ecp_nistz256_ord_sqr_mont
-
-.p2align 5
-_ecp_nistz256_ord_sqr_mont:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- je L$ecp_nistz256_ord_sqr_montx
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$ord_sqr_body:
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%rax
- movq 16(%rsi),%r14
- movq 24(%rsi),%r15
- leaq L$ord(%rip),%rsi
- movq %rdx,%rbx
- jmp L$oop_ord_sqr
-
-.p2align 5
-L$oop_ord_sqr:
-
- movq %rax,%rbp
- mulq %r8
- movq %rax,%r9
-.byte 102,72,15,110,205
- movq %r14,%rax
- movq %rdx,%r10
-
- mulq %r8
- addq %rax,%r10
- movq %r15,%rax
-.byte 102,73,15,110,214
- adcq $0,%rdx
- movq %rdx,%r11
-
- mulq %r8
- addq %rax,%r11
- movq %r15,%rax
-.byte 102,73,15,110,223
- adcq $0,%rdx
- movq %rdx,%r12
-
-
- mulq %r14
- movq %rax,%r13
- movq %r14,%rax
- movq %rdx,%r14
-
-
- mulq %rbp
- addq %rax,%r11
- movq %r15,%rax
- adcq $0,%rdx
- movq %rdx,%r15
-
- mulq %rbp
- addq %rax,%r12
- adcq $0,%rdx
-
- addq %r15,%r12
- adcq %rdx,%r13
- adcq $0,%r14
-
-
- xorq %r15,%r15
- movq %r8,%rax
- addq %r9,%r9
- adcq %r10,%r10
- adcq %r11,%r11
- adcq %r12,%r12
- adcq %r13,%r13
- adcq %r14,%r14
- adcq $0,%r15
-
-
- mulq %rax
- movq %rax,%r8
-.byte 102,72,15,126,200
- movq %rdx,%rbp
-
- mulq %rax
- addq %rbp,%r9
- adcq %rax,%r10
-.byte 102,72,15,126,208
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq %rax
- addq %rbp,%r11
- adcq %rax,%r12
-.byte 102,72,15,126,216
- adcq $0,%rdx
- movq %rdx,%rbp
-
- movq %r8,%rcx
- imulq 32(%rsi),%r8
-
- mulq %rax
- addq %rbp,%r13
- adcq %rax,%r14
- movq 0(%rsi),%rax
- adcq %rdx,%r15
-
-
- mulq %r8
- movq %r8,%rbp
- addq %rax,%rcx
- movq 8(%rsi),%rax
- adcq %rdx,%rcx
-
- subq %r8,%r10
- sbbq $0,%rbp
-
- mulq %r8
- addq %rcx,%r9
- adcq $0,%rdx
- addq %rax,%r9
- movq %r8,%rax
- adcq %rdx,%r10
- movq %r8,%rdx
- adcq $0,%rbp
-
- movq %r9,%rcx
- imulq 32(%rsi),%r9
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r11
- movq 0(%rsi),%rax
- sbbq %rdx,%r8
-
- addq %rbp,%r11
- adcq $0,%r8
-
-
- mulq %r9
- movq %r9,%rbp
- addq %rax,%rcx
- movq 8(%rsi),%rax
- adcq %rdx,%rcx
-
- subq %r9,%r11
- sbbq $0,%rbp
-
- mulq %r9
- addq %rcx,%r10
- adcq $0,%rdx
- addq %rax,%r10
- movq %r9,%rax
- adcq %rdx,%r11
- movq %r9,%rdx
- adcq $0,%rbp
-
- movq %r10,%rcx
- imulq 32(%rsi),%r10
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r8
- movq 0(%rsi),%rax
- sbbq %rdx,%r9
-
- addq %rbp,%r8
- adcq $0,%r9
-
-
- mulq %r10
- movq %r10,%rbp
- addq %rax,%rcx
- movq 8(%rsi),%rax
- adcq %rdx,%rcx
-
- subq %r10,%r8
- sbbq $0,%rbp
-
- mulq %r10
- addq %rcx,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %r10,%rax
- adcq %rdx,%r8
- movq %r10,%rdx
- adcq $0,%rbp
-
- movq %r11,%rcx
- imulq 32(%rsi),%r11
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r9
- movq 0(%rsi),%rax
- sbbq %rdx,%r10
-
- addq %rbp,%r9
- adcq $0,%r10
-
-
- mulq %r11
- movq %r11,%rbp
- addq %rax,%rcx
- movq 8(%rsi),%rax
- adcq %rdx,%rcx
-
- subq %r11,%r9
- sbbq $0,%rbp
-
- mulq %r11
- addq %rcx,%r8
- adcq $0,%rdx
- addq %rax,%r8
- movq %r11,%rax
- adcq %rdx,%r9
- movq %r11,%rdx
- adcq $0,%rbp
-
- shlq $32,%rax
- shrq $32,%rdx
- subq %rax,%r10
- sbbq %rdx,%r11
-
- addq %rbp,%r10
- adcq $0,%r11
-
-
- xorq %rdx,%rdx
- addq %r12,%r8
- adcq %r13,%r9
- movq %r8,%r12
- adcq %r14,%r10
- adcq %r15,%r11
- movq %r9,%rax
- adcq $0,%rdx
-
-
- subq 0(%rsi),%r8
- movq %r10,%r14
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r11,%r15
- sbbq 24(%rsi),%r11
- sbbq $0,%rdx
-
- cmovcq %r12,%r8
- cmovncq %r9,%rax
- cmovncq %r10,%r14
- cmovncq %r11,%r15
-
- decq %rbx
- jnz L$oop_ord_sqr
-
- movq %r8,0(%rdi)
- movq %rax,8(%rdi)
- pxor %xmm1,%xmm1
- movq %r14,16(%rdi)
- pxor %xmm2,%xmm2
- movq %r15,24(%rdi)
- pxor %xmm3,%xmm3
-
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$ord_sqr_epilogue:
- ret
-
-
-
-
-.p2align 5
-ecp_nistz256_ord_mul_montx:
-
-L$ecp_nistz256_ord_mul_montx:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$ord_mulx_body:
-
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
- leaq L$ord-128(%rip),%r14
- movq L$ordK(%rip),%r15
-
-
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- mulxq %r11,%rbp,%r11
- addq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- mulxq %r15,%rdx,%rax
- adcq %rbp,%r10
- adcq %rcx,%r11
- adcq $0,%r12
-
-
- xorq %r13,%r13
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 8(%rbx),%rdx
- adcxq %rcx,%r11
- adoxq %rbp,%r12
- adcxq %r8,%r12
- adoxq %r8,%r13
- adcq $0,%r13
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 16(%rbx),%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcxq %r9,%r13
- adoxq %r9,%r8
- adcq $0,%r8
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 24(%rbx),%rdx
- adcxq %rcx,%r13
- adoxq %rbp,%r8
- adcxq %r10,%r8
- adoxq %r10,%r9
- adcq $0,%r9
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r8
- adoxq %rbp,%r9
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%r14),%rcx,%rbp
- leaq 128(%r14),%r14
- movq %r12,%rbx
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- movq %r13,%rdx
- adcxq %r11,%r9
- adoxq %r11,%r10
- adcq $0,%r10
-
-
-
- movq %r8,%rcx
- subq 0(%r14),%r12
- sbbq 8(%r14),%r13
- sbbq 16(%r14),%r8
- movq %r9,%rbp
- sbbq 24(%r14),%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- cmovcq %rcx,%r8
- cmovcq %rbp,%r9
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$ord_mulx_epilogue:
- ret
-
-
-
-
-.p2align 5
-ecp_nistz256_ord_sqr_montx:
-
-L$ecp_nistz256_ord_sqr_montx:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$ord_sqrx_body:
-
- movq %rdx,%rbx
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq L$ord(%rip),%rsi
- jmp L$oop_ord_sqrx
-
-.p2align 5
-L$oop_ord_sqrx:
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- movq %rdx,%rax
-.byte 102,73,15,110,206
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- addq %rcx,%r10
-.byte 102,73,15,110,215
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
- mulxq %r8,%rcx,%r14
- movq %rax,%rdx
-.byte 102,73,15,110,216
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
-
- mulxq %rdx,%r8,%rbp
-.byte 102,72,15,126,202
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
-.byte 102,72,15,126,210
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
- mulxq %rdx,%rcx,%rbp
-.byte 0x67
-.byte 102,72,15,126,218
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- adoxq %rbp,%r13
- mulxq %rdx,%rcx,%rax
- adoxq %rcx,%r14
- adoxq %rax,%r15
-
-
- movq %r8,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- xorq %rax,%rax
- mulxq 0(%rsi),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- mulxq 8(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
- mulxq 16(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
- mulxq 24(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r8
- adcxq %rax,%r8
-
-
- movq %r9,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adoxq %rcx,%r9
- adcxq %rbp,%r10
- mulxq 8(%rsi),%rcx,%rbp
- adoxq %rcx,%r10
- adcxq %rbp,%r11
- mulxq 16(%rsi),%rcx,%rbp
- adoxq %rcx,%r11
- adcxq %rbp,%r8
- mulxq 24(%rsi),%rcx,%rbp
- adoxq %rcx,%r8
- adcxq %rbp,%r9
- adoxq %rax,%r9
-
-
- movq %r10,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
- mulxq 8(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r8
- mulxq 16(%rsi),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- mulxq 24(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
- adcxq %rax,%r10
-
-
- movq %r11,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adoxq %rcx,%r11
- adcxq %rbp,%r8
- mulxq 8(%rsi),%rcx,%rbp
- adoxq %rcx,%r8
- adcxq %rbp,%r9
- mulxq 16(%rsi),%rcx,%rbp
- adoxq %rcx,%r9
- adcxq %rbp,%r10
- mulxq 24(%rsi),%rcx,%rbp
- adoxq %rcx,%r10
- adcxq %rbp,%r11
- adoxq %rax,%r11
-
-
- addq %r8,%r12
- adcq %r13,%r9
- movq %r12,%rdx
- adcq %r14,%r10
- adcq %r15,%r11
- movq %r9,%r14
- adcq $0,%rax
-
-
- subq 0(%rsi),%r12
- movq %r10,%r15
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r11,%r8
- sbbq 24(%rsi),%r11
- sbbq $0,%rax
-
- cmovncq %r12,%rdx
- cmovncq %r9,%r14
- cmovncq %r10,%r15
- cmovncq %r11,%r8
-
- decq %rbx
- jnz L$oop_ord_sqrx
-
- movq %rdx,0(%rdi)
- movq %r14,8(%rdi)
- pxor %xmm1,%xmm1
- movq %r15,16(%rdi)
- pxor %xmm2,%xmm2
- movq %r8,24(%rdi)
- pxor %xmm3,%xmm3
-
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$ord_sqrx_epilogue:
- ret
-
-
-
-
-
-
-
-
-.globl _ecp_nistz256_mul_mont
-.private_extern _ecp_nistz256_mul_mont
-
-.p2align 5
-_ecp_nistz256_mul_mont:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
-L$mul_mont:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$mul_body:
- cmpl $0x80100,%ecx
- je L$mul_montx
- movq %rdx,%rbx
- movq 0(%rdx),%rax
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
-
- call __ecp_nistz256_mul_montq
- jmp L$mul_mont_done
-
-.p2align 5
-L$mul_montx:
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_mul_montx
-L$mul_mont_done:
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$mul_epilogue:
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_mul_montq:
-
-
-
- movq %rax,%rbp
- mulq %r9
- movq L$poly+8(%rip),%r14
- movq %rax,%r8
- movq %rbp,%rax
- movq %rdx,%r9
-
- mulq %r10
- movq L$poly+24(%rip),%r15
- addq %rax,%r9
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%r10
-
- mulq %r11
- addq %rax,%r10
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%r11
-
- mulq %r12
- addq %rax,%r11
- movq %r8,%rax
- adcq $0,%rdx
- xorq %r13,%r13
- movq %rdx,%r12
-
-
-
-
-
-
-
-
-
-
- movq %r8,%rbp
- shlq $32,%r8
- mulq %r15
- shrq $32,%rbp
- addq %r8,%r9
- adcq %rbp,%r10
- adcq %rax,%r11
- movq 8(%rbx),%rax
- adcq %rdx,%r12
- adcq $0,%r13
- xorq %r8,%r8
-
-
-
- movq %rax,%rbp
- mulq 0(%rsi)
- addq %rax,%r9
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 8(%rsi)
- addq %rcx,%r10
- adcq $0,%rdx
- addq %rax,%r10
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 16(%rsi)
- addq %rcx,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 24(%rsi)
- addq %rcx,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %r9,%rax
- adcq %rdx,%r13
- adcq $0,%r8
-
-
-
- movq %r9,%rbp
- shlq $32,%r9
- mulq %r15
- shrq $32,%rbp
- addq %r9,%r10
- adcq %rbp,%r11
- adcq %rax,%r12
- movq 16(%rbx),%rax
- adcq %rdx,%r13
- adcq $0,%r8
- xorq %r9,%r9
-
-
-
- movq %rax,%rbp
- mulq 0(%rsi)
- addq %rax,%r10
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 8(%rsi)
- addq %rcx,%r11
- adcq $0,%rdx
- addq %rax,%r11
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 16(%rsi)
- addq %rcx,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 24(%rsi)
- addq %rcx,%r13
- adcq $0,%rdx
- addq %rax,%r13
- movq %r10,%rax
- adcq %rdx,%r8
- adcq $0,%r9
-
-
-
- movq %r10,%rbp
- shlq $32,%r10
- mulq %r15
- shrq $32,%rbp
- addq %r10,%r11
- adcq %rbp,%r12
- adcq %rax,%r13
- movq 24(%rbx),%rax
- adcq %rdx,%r8
- adcq $0,%r9
- xorq %r10,%r10
-
-
-
- movq %rax,%rbp
- mulq 0(%rsi)
- addq %rax,%r11
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 8(%rsi)
- addq %rcx,%r12
- adcq $0,%rdx
- addq %rax,%r12
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 16(%rsi)
- addq %rcx,%r13
- adcq $0,%rdx
- addq %rax,%r13
- movq %rbp,%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq 24(%rsi)
- addq %rcx,%r8
- adcq $0,%rdx
- addq %rax,%r8
- movq %r11,%rax
- adcq %rdx,%r9
- adcq $0,%r10
-
-
-
- movq %r11,%rbp
- shlq $32,%r11
- mulq %r15
- shrq $32,%rbp
- addq %r11,%r12
- adcq %rbp,%r13
- movq %r12,%rcx
- adcq %rax,%r8
- adcq %rdx,%r9
- movq %r13,%rbp
- adcq $0,%r10
-
-
-
- subq $-1,%r12
- movq %r8,%rbx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%rdx
- sbbq %r15,%r9
- sbbq $0,%r10
-
- cmovcq %rcx,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rbx,%r8
- movq %r13,8(%rdi)
- cmovcq %rdx,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-
-
-
-
-
-
-.globl _ecp_nistz256_sqr_mont
-.private_extern _ecp_nistz256_sqr_mont
-
-.p2align 5
-_ecp_nistz256_sqr_mont:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-L$sqr_body:
- cmpl $0x80100,%ecx
- je L$sqr_montx
- movq 0(%rsi),%rax
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
-
- call __ecp_nistz256_sqr_montq
- jmp L$sqr_mont_done
-
-.p2align 5
-L$sqr_montx:
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_sqr_montx
-L$sqr_mont_done:
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-L$sqr_epilogue:
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_sqr_montq:
-
- movq %rax,%r13
- mulq %r14
- movq %rax,%r9
- movq %r15,%rax
- movq %rdx,%r10
-
- mulq %r13
- addq %rax,%r10
- movq %r8,%rax
- adcq $0,%rdx
- movq %rdx,%r11
-
- mulq %r13
- addq %rax,%r11
- movq %r15,%rax
- adcq $0,%rdx
- movq %rdx,%r12
-
-
- mulq %r14
- addq %rax,%r11
- movq %r8,%rax
- adcq $0,%rdx
- movq %rdx,%rbp
-
- mulq %r14
- addq %rax,%r12
- movq %r8,%rax
- adcq $0,%rdx
- addq %rbp,%r12
- movq %rdx,%r13
- adcq $0,%r13
-
-
- mulq %r15
- xorq %r15,%r15
- addq %rax,%r13
- movq 0(%rsi),%rax
- movq %rdx,%r14
- adcq $0,%r14
-
- addq %r9,%r9
- adcq %r10,%r10
- adcq %r11,%r11
- adcq %r12,%r12
- adcq %r13,%r13
- adcq %r14,%r14
- adcq $0,%r15
-
- mulq %rax
- movq %rax,%r8
- movq 8(%rsi),%rax
- movq %rdx,%rcx
-
- mulq %rax
- addq %rcx,%r9
- adcq %rax,%r10
- movq 16(%rsi),%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq %rax
- addq %rcx,%r11
- adcq %rax,%r12
- movq 24(%rsi),%rax
- adcq $0,%rdx
- movq %rdx,%rcx
-
- mulq %rax
- addq %rcx,%r13
- adcq %rax,%r14
- movq %r8,%rax
- adcq %rdx,%r15
-
- movq L$poly+8(%rip),%rsi
- movq L$poly+24(%rip),%rbp
-
-
-
-
- movq %r8,%rcx
- shlq $32,%r8
- mulq %rbp
- shrq $32,%rcx
- addq %r8,%r9
- adcq %rcx,%r10
- adcq %rax,%r11
- movq %r9,%rax
- adcq $0,%rdx
-
-
-
- movq %r9,%rcx
- shlq $32,%r9
- movq %rdx,%r8
- mulq %rbp
- shrq $32,%rcx
- addq %r9,%r10
- adcq %rcx,%r11
- adcq %rax,%r8
- movq %r10,%rax
- adcq $0,%rdx
-
-
-
- movq %r10,%rcx
- shlq $32,%r10
- movq %rdx,%r9
- mulq %rbp
- shrq $32,%rcx
- addq %r10,%r11
- adcq %rcx,%r8
- adcq %rax,%r9
- movq %r11,%rax
- adcq $0,%rdx
-
-
-
- movq %r11,%rcx
- shlq $32,%r11
- movq %rdx,%r10
- mulq %rbp
- shrq $32,%rcx
- addq %r11,%r8
- adcq %rcx,%r9
- adcq %rax,%r10
- adcq $0,%rdx
- xorq %r11,%r11
-
-
-
- addq %r8,%r12
- adcq %r9,%r13
- movq %r12,%r8
- adcq %r10,%r14
- adcq %rdx,%r15
- movq %r13,%r9
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r14,%r10
- sbbq %rsi,%r13
- sbbq $0,%r14
- movq %r15,%rcx
- sbbq %rbp,%r15
- sbbq $0,%r11
-
- cmovcq %r8,%r12
- cmovcq %r9,%r13
- movq %r12,0(%rdi)
- cmovcq %r10,%r14
- movq %r13,8(%rdi)
- cmovcq %rcx,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
-
- ret
-
-
-
-.p2align 5
-__ecp_nistz256_mul_montx:
-
-
-
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- movq $32,%r14
- xorq %r13,%r13
- mulxq %r11,%rbp,%r11
- movq L$poly+24(%rip),%r15
- adcq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- adcq %rbp,%r10
- shlxq %r14,%r8,%rbp
- adcq %rcx,%r11
- shrxq %r14,%r8,%rcx
- adcq $0,%r12
-
-
-
- addq %rbp,%r9
- adcq %rcx,%r10
-
- mulxq %r15,%rcx,%rbp
- movq 8(%rbx),%rdx
- adcq %rcx,%r11
- adcq %rbp,%r12
- adcq $0,%r13
- xorq %r8,%r8
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- adcxq %rcx,%r12
- shlxq %r14,%r9,%rcx
- adoxq %rbp,%r13
- shrxq %r14,%r9,%rbp
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
-
- addq %rcx,%r10
- adcq %rbp,%r11
-
- mulxq %r15,%rcx,%rbp
- movq 16(%rbx),%rdx
- adcq %rcx,%r12
- adcq %rbp,%r13
- adcq $0,%r8
- xorq %r9,%r9
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- adcxq %rcx,%r13
- shlxq %r14,%r10,%rcx
- adoxq %rbp,%r8
- shrxq %r14,%r10,%rbp
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
-
- addq %rcx,%r11
- adcq %rbp,%r12
-
- mulxq %r15,%rcx,%rbp
- movq 24(%rbx),%rdx
- adcq %rcx,%r13
- adcq %rbp,%r8
- adcq $0,%r9
- xorq %r10,%r10
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- adcxq %rcx,%r8
- shlxq %r14,%r11,%rcx
- adoxq %rbp,%r9
- shrxq %r14,%r11,%rbp
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
-
- addq %rcx,%r12
- adcq %rbp,%r13
-
- mulxq %r15,%rcx,%rbp
- movq %r12,%rbx
- movq L$poly+8(%rip),%r14
- adcq %rcx,%r8
- movq %r13,%rdx
- adcq %rbp,%r9
- adcq $0,%r10
-
-
-
- xorl %eax,%eax
- movq %r8,%rcx
- sbbq $-1,%r12
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%rbp
- sbbq %r15,%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %rbp,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_sqr_montx:
-
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- xorl %eax,%eax
- adcq %rcx,%r10
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
-
- mulxq %r8,%rcx,%r14
- movq 0+128(%rsi),%rdx
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
- mulxq %rdx,%r8,%rbp
- movq 8+128(%rsi),%rdx
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
- movq 16+128(%rsi),%rdx
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
-.byte 0x67
- mulxq %rdx,%rcx,%rbp
- movq 24+128(%rsi),%rdx
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- movq $32,%rsi
- adoxq %rbp,%r13
-.byte 0x67,0x67
- mulxq %rdx,%rcx,%rax
- movq L$poly+24(%rip),%rdx
- adoxq %rcx,%r14
- shlxq %rsi,%r8,%rcx
- adoxq %rax,%r15
- shrxq %rsi,%r8,%rax
- movq %rdx,%rbp
-
-
- addq %rcx,%r9
- adcq %rax,%r10
-
- mulxq %r8,%rcx,%r8
- adcq %rcx,%r11
- shlxq %rsi,%r9,%rcx
- adcq $0,%r8
- shrxq %rsi,%r9,%rax
-
-
- addq %rcx,%r10
- adcq %rax,%r11
-
- mulxq %r9,%rcx,%r9
- adcq %rcx,%r8
- shlxq %rsi,%r10,%rcx
- adcq $0,%r9
- shrxq %rsi,%r10,%rax
-
-
- addq %rcx,%r11
- adcq %rax,%r8
-
- mulxq %r10,%rcx,%r10
- adcq %rcx,%r9
- shlxq %rsi,%r11,%rcx
- adcq $0,%r10
- shrxq %rsi,%r11,%rax
-
-
- addq %rcx,%r8
- adcq %rax,%r9
-
- mulxq %r11,%rcx,%r11
- adcq %rcx,%r10
- adcq $0,%r11
-
- xorq %rdx,%rdx
- addq %r8,%r12
- movq L$poly+8(%rip),%rsi
- adcq %r9,%r13
- movq %r12,%r8
- adcq %r10,%r14
- adcq %r11,%r15
- movq %r13,%r9
- adcq $0,%rdx
-
- subq $-1,%r12
- movq %r14,%r10
- sbbq %rsi,%r13
- sbbq $0,%r14
- movq %r15,%r11
- sbbq %rbp,%r15
- sbbq $0,%rdx
-
- cmovcq %r8,%r12
- cmovcq %r9,%r13
- movq %r12,0(%rdi)
- cmovcq %r10,%r14
- movq %r13,8(%rdi)
- cmovcq %r11,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
-
- ret
-
-
-
-
-.globl _ecp_nistz256_select_w5
-.private_extern _ecp_nistz256_select_w5
-
-.p2align 5
-_ecp_nistz256_select_w5:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rax
- movq 8(%rax),%rax
- testl $32,%eax
- jnz L$avx2_select_w5
- movdqa L$One(%rip),%xmm0
- movd %edx,%xmm1
-
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
-
- movdqa %xmm0,%xmm8
- pshufd $0,%xmm1,%xmm1
-
- movq $16,%rax
-L$select_loop_sse_w5:
-
- movdqa %xmm8,%xmm15
- paddd %xmm0,%xmm8
- pcmpeqd %xmm1,%xmm15
-
- movdqa 0(%rsi),%xmm9
- movdqa 16(%rsi),%xmm10
- movdqa 32(%rsi),%xmm11
- movdqa 48(%rsi),%xmm12
- movdqa 64(%rsi),%xmm13
- movdqa 80(%rsi),%xmm14
- leaq 96(%rsi),%rsi
-
- pand %xmm15,%xmm9
- pand %xmm15,%xmm10
- por %xmm9,%xmm2
- pand %xmm15,%xmm11
- por %xmm10,%xmm3
- pand %xmm15,%xmm12
- por %xmm11,%xmm4
- pand %xmm15,%xmm13
- por %xmm12,%xmm5
- pand %xmm15,%xmm14
- por %xmm13,%xmm6
- por %xmm14,%xmm7
-
- decq %rax
- jnz L$select_loop_sse_w5
-
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
- movdqu %xmm4,32(%rdi)
- movdqu %xmm5,48(%rdi)
- movdqu %xmm6,64(%rdi)
- movdqu %xmm7,80(%rdi)
- ret
-
-L$SEH_end_ecp_nistz256_select_w5:
-
-
-
-
-.globl _ecp_nistz256_select_w7
-.private_extern _ecp_nistz256_select_w7
-
-.p2align 5
-_ecp_nistz256_select_w7:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rax
- movq 8(%rax),%rax
- testl $32,%eax
- jnz L$avx2_select_w7
- movdqa L$One(%rip),%xmm8
- movd %edx,%xmm1
-
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
-
- movdqa %xmm8,%xmm0
- pshufd $0,%xmm1,%xmm1
- movq $64,%rax
-
-L$select_loop_sse_w7:
- movdqa %xmm8,%xmm15
- paddd %xmm0,%xmm8
- movdqa 0(%rsi),%xmm9
- movdqa 16(%rsi),%xmm10
- pcmpeqd %xmm1,%xmm15
- movdqa 32(%rsi),%xmm11
- movdqa 48(%rsi),%xmm12
- leaq 64(%rsi),%rsi
-
- pand %xmm15,%xmm9
- pand %xmm15,%xmm10
- por %xmm9,%xmm2
- pand %xmm15,%xmm11
- por %xmm10,%xmm3
- pand %xmm15,%xmm12
- por %xmm11,%xmm4
- prefetcht0 255(%rsi)
- por %xmm12,%xmm5
-
- decq %rax
- jnz L$select_loop_sse_w7
-
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
- movdqu %xmm4,32(%rdi)
- movdqu %xmm5,48(%rdi)
- ret
-
-L$SEH_end_ecp_nistz256_select_w7:
-
-
-
-
-.p2align 5
-ecp_nistz256_avx2_select_w5:
-
-L$avx2_select_w5:
- vzeroupper
- vmovdqa L$Two(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
- vpxor %ymm4,%ymm4,%ymm4
-
- vmovdqa L$One(%rip),%ymm5
- vmovdqa L$Two(%rip),%ymm10
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
- movq $8,%rax
-L$select_loop_avx2_w5:
-
- vmovdqa 0(%rsi),%ymm6
- vmovdqa 32(%rsi),%ymm7
- vmovdqa 64(%rsi),%ymm8
-
- vmovdqa 96(%rsi),%ymm11
- vmovdqa 128(%rsi),%ymm12
- vmovdqa 160(%rsi),%ymm13
-
- vpcmpeqd %ymm1,%ymm5,%ymm9
- vpcmpeqd %ymm1,%ymm10,%ymm14
-
- vpaddd %ymm0,%ymm5,%ymm5
- vpaddd %ymm0,%ymm10,%ymm10
- leaq 192(%rsi),%rsi
-
- vpand %ymm9,%ymm6,%ymm6
- vpand %ymm9,%ymm7,%ymm7
- vpand %ymm9,%ymm8,%ymm8
- vpand %ymm14,%ymm11,%ymm11
- vpand %ymm14,%ymm12,%ymm12
- vpand %ymm14,%ymm13,%ymm13
-
- vpxor %ymm6,%ymm2,%ymm2
- vpxor %ymm7,%ymm3,%ymm3
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm11,%ymm2,%ymm2
- vpxor %ymm12,%ymm3,%ymm3
- vpxor %ymm13,%ymm4,%ymm4
-
- decq %rax
- jnz L$select_loop_avx2_w5
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vmovdqu %ymm4,64(%rdi)
- vzeroupper
- ret
-
-L$SEH_end_ecp_nistz256_avx2_select_w5:
-
-
-
-
-.globl _ecp_nistz256_avx2_select_w7
-.private_extern _ecp_nistz256_avx2_select_w7
-
-.p2align 5
-_ecp_nistz256_avx2_select_w7:
-
-L$avx2_select_w7:
-_CET_ENDBR
- vzeroupper
- vmovdqa L$Three(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
-
- vmovdqa L$One(%rip),%ymm4
- vmovdqa L$Two(%rip),%ymm8
- vmovdqa L$Three(%rip),%ymm12
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
-
- movq $21,%rax
-L$select_loop_avx2_w7:
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vmovdqa 64(%rsi),%ymm9
- vmovdqa 96(%rsi),%ymm10
-
- vmovdqa 128(%rsi),%ymm13
- vmovdqa 160(%rsi),%ymm14
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
- vpcmpeqd %ymm1,%ymm8,%ymm11
- vpcmpeqd %ymm1,%ymm12,%ymm15
-
- vpaddd %ymm0,%ymm4,%ymm4
- vpaddd %ymm0,%ymm8,%ymm8
- vpaddd %ymm0,%ymm12,%ymm12
- leaq 192(%rsi),%rsi
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
- vpand %ymm11,%ymm9,%ymm9
- vpand %ymm11,%ymm10,%ymm10
- vpand %ymm15,%ymm13,%ymm13
- vpand %ymm15,%ymm14,%ymm14
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
- vpxor %ymm9,%ymm2,%ymm2
- vpxor %ymm10,%ymm3,%ymm3
- vpxor %ymm13,%ymm2,%ymm2
- vpxor %ymm14,%ymm3,%ymm3
-
- decq %rax
- jnz L$select_loop_avx2_w7
-
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vzeroupper
- ret
-
-L$SEH_end_ecp_nistz256_avx2_select_w7:
-
-
-.p2align 5
-__ecp_nistz256_add_toq:
-
- xorq %r11,%r11
- addq 0(%rbx),%r12
- adcq 8(%rbx),%r13
- movq %r12,%rax
- adcq 16(%rbx),%r8
- adcq 24(%rbx),%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_sub_fromq:
-
- subq 0(%rbx),%r12
- sbbq 8(%rbx),%r13
- movq %r12,%rax
- sbbq 16(%rbx),%r8
- sbbq 24(%rbx),%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- addq $-1,%r12
- movq %r8,%rcx
- adcq %r14,%r13
- adcq $0,%r8
- movq %r9,%r10
- adcq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- cmovzq %rbp,%r13
- movq %r12,0(%rdi)
- cmovzq %rcx,%r8
- movq %r13,8(%rdi)
- cmovzq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_subq:
-
- subq %r12,%rax
- sbbq %r13,%rbp
- movq %rax,%r12
- sbbq %r8,%rcx
- sbbq %r9,%r10
- movq %rbp,%r13
- sbbq %r11,%r11
-
- addq $-1,%rax
- movq %rcx,%r8
- adcq %r14,%rbp
- adcq $0,%rcx
- movq %r10,%r9
- adcq %r15,%r10
- testq %r11,%r11
-
- cmovnzq %rax,%r12
- cmovnzq %rbp,%r13
- cmovnzq %rcx,%r8
- cmovnzq %r10,%r9
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_mul_by_2q:
-
- xorq %r11,%r11
- addq %r12,%r12
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-.globl _ecp_nistz256_point_double
-.private_extern _ecp_nistz256_point_double
-
-.p2align 5
-_ecp_nistz256_point_double:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- je L$point_doublex
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $160+8,%rsp
-
-L$point_doubleq_body:
-
-L$point_double_shortcutq:
- movdqu 0(%rsi),%xmm0
- movq %rsi,%rbx
- movdqu 16(%rsi),%xmm1
- movq 32+0(%rsi),%r12
- movq 32+8(%rsi),%r13
- movq 32+16(%rsi),%r8
- movq 32+24(%rsi),%r9
- movq L$poly+8(%rip),%r14
- movq L$poly+24(%rip),%r15
- movdqa %xmm0,96(%rsp)
- movdqa %xmm1,96+16(%rsp)
- leaq 32(%rdi),%r10
- leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
-
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- leaq 64-0(%rsi),%rsi
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 32(%rbx),%rax
- movq 64+0(%rbx),%r9
- movq 64+8(%rbx),%r10
- movq 64+16(%rbx),%r11
- movq 64+24(%rbx),%r12
- leaq 64-0(%rbx),%rsi
- leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montq
- call __ecp_nistz256_mul_by_2q
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_toq
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montq
- xorq %r9,%r9
- movq %r12,%rax
- addq $-1,%r12
- movq %r13,%r10
- adcq %rsi,%r13
- movq %r14,%rcx
- adcq $0,%r14
- movq %r15,%r8
- adcq %rbp,%r15
- adcq $0,%r9
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r12
- cmovzq %r10,%r13
- cmovzq %rcx,%r14
- cmovzq %r8,%r15
- cmovzq %rsi,%r9
-
- movq %r13,%rax
- shrq $1,%r12
- shlq $63,%rax
- movq %r14,%r10
- shrq $1,%r13
- orq %rax,%r12
- shlq $63,%r10
- movq %r15,%rcx
- shrq $1,%r14
- orq %r10,%r13
- shlq $63,%rcx
- movq %r12,0(%rdi)
- shrq $1,%r15
- movq %r13,8(%rdi)
- shlq $63,%r9
- orq %rcx,%r14
- orq %r9,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
- movq 64(%rsp),%rax
- leaq 64(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- leaq 32(%rsp),%rbx
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_toq
-
- movq 96(%rsp),%rax
- leaq 96(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- movq 0+32(%rsp),%rax
- movq 8+32(%rsp),%r14
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r15
- movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montq
-
- leaq 128(%rsp),%rbx
- movq %r14,%r8
- movq %r15,%r9
- movq %rsi,%r14
- movq %rbp,%r15
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_subq
-
- movq 32(%rsp),%rax
- leaq 32(%rsp),%rbx
- movq %r12,%r14
- xorl %ecx,%ecx
- movq %r12,0+0(%rsp)
- movq %r13,%r10
- movq %r13,0+8(%rsp)
- cmovzq %r8,%r11
- movq %r8,0+16(%rsp)
- leaq 0-0(%rsp),%rsi
- cmovzq %r9,%r12
- movq %r9,0+24(%rsp)
- movq %r14,%r9
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromq
-
- leaq 160+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$point_doubleq_epilogue:
- ret
-
-
-.globl _ecp_nistz256_point_add
-.private_extern _ecp_nistz256_point_add
-
-.p2align 5
-_ecp_nistz256_point_add:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- je L$point_addx
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $576+8,%rsp
-
-L$point_addq_body:
-
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq %rsi,%rbx
- movq %rdx,%rsi
- movdqa %xmm0,384(%rsp)
- movdqa %xmm1,384+16(%rsp)
- movdqa %xmm2,416(%rsp)
- movdqa %xmm3,416+16(%rsp)
- movdqa %xmm4,448(%rsp)
- movdqa %xmm5,448+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rsi),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,480(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,480+16(%rsp)
- movdqu 64(%rsi),%xmm0
- movdqu 80(%rsi),%xmm1
- movdqa %xmm2,512(%rsp)
- movdqa %xmm3,512+16(%rsp)
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
-
- leaq 64-0(%rsi),%rsi
- movq %rax,544+0(%rsp)
- movq %r14,544+8(%rsp)
- movq %r15,544+16(%rsp)
- movq %r8,544+24(%rsp)
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm1,%xmm4
- por %xmm1,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rax
- movq 64+8(%rbx),%r14
- movq 64+16(%rbx),%r15
- movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
-
- leaq 64-0(%rbx),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 544(%rsp),%rax
- leaq 544(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 416(%rsp),%rax
- leaq 416(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq 0+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 512(%rsp),%rax
- leaq 512(%rsp),%rbx
- movq 0+256(%rsp),%r9
- movq 8+256(%rsp),%r10
- leaq 0+256(%rsp),%rsi
- movq 16+256(%rsp),%r11
- movq 24+256(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 224(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- orq %r13,%r12
- movdqa %xmm4,%xmm2
- orq %r8,%r12
- orq %r9,%r12
- por %xmm5,%xmm2
-.byte 102,73,15,110,220
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 480(%rsp),%rax
- leaq 480(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 160(%rsp),%rbx
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- orq %r13,%r12
- orq %r8,%r12
- orq %r9,%r12
-
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
- orq %r8,%r12
-.byte 0x3e
- jnz L$add_proceedq
-
-
-
- testq %r9,%r9
- jz L$add_doubleq
-
-
-
-
-
-
-.byte 102,72,15,126,199
- pxor %xmm0,%xmm0
- movdqu %xmm0,0(%rdi)
- movdqu %xmm0,16(%rdi)
- movdqu %xmm0,32(%rdi)
- movdqu %xmm0,48(%rdi)
- movdqu %xmm0,64(%rdi)
- movdqu %xmm0,80(%rdi)
- jmp L$add_doneq
-
-.p2align 5
-L$add_doubleq:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
- addq $416,%rsp
-
- jmp L$point_double_shortcutq
-
-
-.p2align 5
-L$add_proceedq:
- movq 0+64(%rsp),%rax
- movq 8+64(%rsp),%r14
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 544(%rsp),%rax
- leaq 544(%rsp),%rbx
- movq 0+352(%rsp),%r9
- movq 8+352(%rsp),%r10
- leaq 0+352(%rsp),%rsi
- movq 16+352(%rsp),%r11
- movq 24+352(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 0(%rsp),%rax
- leaq 0(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 160(%rsp),%rax
- leaq 160(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 96(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subq
-
- leaq 128(%rsp),%rbx
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 192+0(%rsp),%rax
- movq 192+8(%rsp),%rbp
- movq 192+16(%rsp),%rcx
- movq 192+24(%rsp),%r10
- leaq 320(%rsp),%rdi
-
- call __ecp_nistz256_subq
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 128(%rsp),%rax
- leaq 128(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq 0+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 320(%rsp),%rax
- leaq 320(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 256(%rsp),%rbx
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 352(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 352+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 544(%rsp),%xmm2
- pand 544+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 480(%rsp),%xmm2
- pand 480+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 320(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 320+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 512(%rsp),%xmm2
- pand 512+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
-L$add_doneq:
- leaq 576+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$point_addq_epilogue:
- ret
-
-
-.globl _ecp_nistz256_point_add_affine
-.private_extern _ecp_nistz256_point_add_affine
-
-.p2align 5
-_ecp_nistz256_point_add_affine:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%rcx
- movq 8(%rcx),%rcx
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- je L$point_add_affinex
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $480+8,%rsp
-
-L$add_affineq_body:
-
- movdqu 0(%rsi),%xmm0
- movq %rdx,%rbx
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,320(%rsp)
- movdqa %xmm1,320+16(%rsp)
- movdqa %xmm2,352(%rsp)
- movdqa %xmm3,352+16(%rsp)
- movdqa %xmm4,384(%rsp)
- movdqa %xmm5,384+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rbx),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rbx),%xmm1
- movdqu 32(%rbx),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rbx),%xmm3
- movdqa %xmm0,416(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,416+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,448(%rsp)
- movdqa %xmm3,448+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-0(%rsi),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm3,%xmm4
- movq 0(%rbx),%rax
-
- movq %r12,%r9
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- movq %r13,%r10
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- movq %r14,%r11
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
-
- leaq 32-0(%rsp),%rsi
- movq %r15,%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 320(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 352(%rsp),%rbx
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+64(%rsp),%rax
- movq 8+64(%rsp),%r14
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 0+96(%rsp),%rax
- movq 8+96(%rsp),%r14
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r15
- movq 24+96(%rsp),%r8
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 128(%rsp),%rax
- leaq 128(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 320(%rsp),%rax
- leaq 320(%rsp),%rbx
- movq 0+128(%rsp),%r9
- movq 8+128(%rsp),%r10
- leaq 0+128(%rsp),%rsi
- movq 16+128(%rsp),%r11
- movq 24+128(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 192(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subq
-
- leaq 160(%rsp),%rbx
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 64(%rsp),%rdi
-
- call __ecp_nistz256_subq
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 352(%rsp),%rax
- leaq 352(%rsp),%rbx
- movq 0+160(%rsp),%r9
- movq 8+160(%rsp),%r10
- leaq 0+160(%rsp),%rsi
- movq 16+160(%rsp),%r11
- movq 24+160(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 96(%rsp),%rax
- leaq 96(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 32(%rsp),%rbx
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand L$ONE_mont(%rip),%xmm2
- pand L$ONE_mont+16(%rip),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 224(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 224+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 320(%rsp),%xmm2
- pand 320+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 256(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 256+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 352(%rsp),%xmm2
- pand 352+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
- leaq 480+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$add_affineq_epilogue:
- ret
-
-
-
-.p2align 5
-__ecp_nistz256_add_tox:
-
- xorq %r11,%r11
- adcq 0(%rbx),%r12
- adcq 8(%rbx),%r13
- movq %r12,%rax
- adcq 16(%rbx),%r8
- adcq 24(%rbx),%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_sub_fromx:
-
- xorq %r11,%r11
- sbbq 0(%rbx),%r12
- sbbq 8(%rbx),%r13
- movq %r12,%rax
- sbbq 16(%rbx),%r8
- sbbq 24(%rbx),%r9
- movq %r13,%rbp
- sbbq $0,%r11
-
- xorq %r10,%r10
- adcq $-1,%r12
- movq %r8,%rcx
- adcq %r14,%r13
- adcq $0,%r8
- movq %r9,%r10
- adcq %r15,%r9
-
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
- movq %r12,0(%rdi)
- cmovncq %rcx,%r8
- movq %r13,8(%rdi)
- cmovncq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_subx:
-
- xorq %r11,%r11
- sbbq %r12,%rax
- sbbq %r13,%rbp
- movq %rax,%r12
- sbbq %r8,%rcx
- sbbq %r9,%r10
- movq %rbp,%r13
- sbbq $0,%r11
-
- xorq %r9,%r9
- adcq $-1,%rax
- movq %rcx,%r8
- adcq %r14,%rbp
- adcq $0,%rcx
- movq %r10,%r9
- adcq %r15,%r10
-
- btq $0,%r11
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- cmovcq %rcx,%r8
- cmovcq %r10,%r9
-
- ret
-
-
-
-
-.p2align 5
-__ecp_nistz256_mul_by_2x:
-
- xorq %r11,%r11
- adcq %r12,%r12
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- xorq %r10,%r10
- sbbq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- ret
-
-
-
-.p2align 5
-ecp_nistz256_point_doublex:
-
-L$point_doublex:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $160+8,%rsp
-
-L$point_doublex_body:
-
-L$point_double_shortcutx:
- movdqu 0(%rsi),%xmm0
- movq %rsi,%rbx
- movdqu 16(%rsi),%xmm1
- movq 32+0(%rsi),%r12
- movq 32+8(%rsi),%r13
- movq 32+16(%rsi),%r8
- movq 32+24(%rsi),%r9
- movq L$poly+8(%rip),%r14
- movq L$poly+24(%rip),%r15
- movdqa %xmm0,96(%rsp)
- movdqa %xmm1,96+16(%rsp)
- leaq 32(%rdi),%r10
- leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
-
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- leaq 64-128(%rsi),%rsi
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 32(%rbx),%rdx
- movq 64+0(%rbx),%r9
- movq 64+8(%rbx),%r10
- movq 64+16(%rbx),%r11
- movq 64+24(%rbx),%r12
- leaq 64-128(%rbx),%rsi
- leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montx
- call __ecp_nistz256_mul_by_2x
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montx
- xorq %r9,%r9
- movq %r12,%rax
- addq $-1,%r12
- movq %r13,%r10
- adcq %rsi,%r13
- movq %r14,%rcx
- adcq $0,%r14
- movq %r15,%r8
- adcq %rbp,%r15
- adcq $0,%r9
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r12
- cmovzq %r10,%r13
- cmovzq %rcx,%r14
- cmovzq %r8,%r15
- cmovzq %rsi,%r9
-
- movq %r13,%rax
- shrq $1,%r12
- shlq $63,%rax
- movq %r14,%r10
- shrq $1,%r13
- orq %rax,%r12
- shlq $63,%r10
- movq %r15,%rcx
- shrq $1,%r14
- orq %r10,%r13
- shlq $63,%rcx
- movq %r12,0(%rdi)
- shrq $1,%r15
- movq %r13,8(%rdi)
- shlq $63,%r9
- orq %rcx,%r14
- orq %r9,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
- movq 64(%rsp),%rdx
- leaq 64(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- leaq 32(%rsp),%rbx
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
-
- movq 0+32(%rsp),%rdx
- movq 8+32(%rsp),%r14
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r15
- movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montx
-
- leaq 128(%rsp),%rbx
- movq %r14,%r8
- movq %r15,%r9
- movq %rsi,%r14
- movq %rbp,%r15
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_subx
-
- movq 32(%rsp),%rdx
- leaq 32(%rsp),%rbx
- movq %r12,%r14
- xorl %ecx,%ecx
- movq %r12,0+0(%rsp)
- movq %r13,%r10
- movq %r13,0+8(%rsp)
- cmovzq %r8,%r11
- movq %r8,0+16(%rsp)
- leaq 0-128(%rsp),%rsi
- cmovzq %r9,%r12
- movq %r9,0+24(%rsp)
- movq %r14,%r9
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromx
-
- leaq 160+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$point_doublex_epilogue:
- ret
-
-
-
-.p2align 5
-ecp_nistz256_point_addx:
-
-L$point_addx:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $576+8,%rsp
-
-L$point_addx_body:
-
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq %rsi,%rbx
- movq %rdx,%rsi
- movdqa %xmm0,384(%rsp)
- movdqa %xmm1,384+16(%rsp)
- movdqa %xmm2,416(%rsp)
- movdqa %xmm3,416+16(%rsp)
- movdqa %xmm4,448(%rsp)
- movdqa %xmm5,448+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rsi),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,480(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,480+16(%rsp)
- movdqu 64(%rsi),%xmm0
- movdqu 80(%rsi),%xmm1
- movdqa %xmm2,512(%rsp)
- movdqa %xmm3,512+16(%rsp)
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
-
- leaq 64-128(%rsi),%rsi
- movq %rdx,544+0(%rsp)
- movq %r14,544+8(%rsp)
- movq %r15,544+16(%rsp)
- movq %r8,544+24(%rsp)
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm1,%xmm4
- por %xmm1,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rdx
- movq 64+8(%rbx),%r14
- movq 64+16(%rbx),%r15
- movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
-
- leaq 64-128(%rbx),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 416(%rsp),%rdx
- leaq 416(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 512(%rsp),%rdx
- leaq 512(%rsp),%rbx
- movq 0+256(%rsp),%r9
- movq 8+256(%rsp),%r10
- leaq -128+256(%rsp),%rsi
- movq 16+256(%rsp),%r11
- movq 24+256(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 224(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- movdqa %xmm4,%xmm2
- orq %r8,%r12
- orq %r9,%r12
- por %xmm5,%xmm2
-.byte 102,73,15,110,220
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 480(%rsp),%rdx
- leaq 480(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 160(%rsp),%rbx
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- orq %r13,%r12
- orq %r8,%r12
- orq %r9,%r12
-
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
- orq %r8,%r12
-.byte 0x3e
- jnz L$add_proceedx
-
-
-
- testq %r9,%r9
- jz L$add_doublex
-
-
-
-
-
-
-.byte 102,72,15,126,199
- pxor %xmm0,%xmm0
- movdqu %xmm0,0(%rdi)
- movdqu %xmm0,16(%rdi)
- movdqu %xmm0,32(%rdi)
- movdqu %xmm0,48(%rdi)
- movdqu %xmm0,64(%rdi)
- movdqu %xmm0,80(%rdi)
- jmp L$add_donex
-
-.p2align 5
-L$add_doublex:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
- addq $416,%rsp
-
- jmp L$point_double_shortcutx
-
-
-.p2align 5
-L$add_proceedx:
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0+0(%rsp),%rdx
- movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 544(%rsp),%rdx
- leaq 544(%rsp),%rbx
- movq 0+352(%rsp),%r9
- movq 8+352(%rsp),%r10
- leaq -128+352(%rsp),%rsi
- movq 16+352(%rsp),%r11
- movq 24+352(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 0(%rsp),%rdx
- leaq 0(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 160(%rsp),%rdx
- leaq 160(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 96(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 128(%rsp),%rbx
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 192+0(%rsp),%rax
- movq 192+8(%rsp),%rbp
- movq 192+16(%rsp),%rcx
- movq 192+24(%rsp),%r10
- leaq 320(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 256(%rsp),%rbx
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 352(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 352+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 544(%rsp),%xmm2
- pand 544+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 480(%rsp),%xmm2
- pand 480+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 320(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 320+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 512(%rsp),%xmm2
- pand 512+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
-L$add_donex:
- leaq 576+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$point_addx_epilogue:
- ret
-
-
-
-.p2align 5
-ecp_nistz256_point_add_affinex:
-
-L$point_add_affinex:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $480+8,%rsp
-
-L$add_affinex_body:
-
- movdqu 0(%rsi),%xmm0
- movq %rdx,%rbx
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rdx
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,320(%rsp)
- movdqa %xmm1,320+16(%rsp)
- movdqa %xmm2,352(%rsp)
- movdqa %xmm3,352+16(%rsp)
- movdqa %xmm4,384(%rsp)
- movdqa %xmm5,384+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rbx),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rbx),%xmm1
- movdqu 32(%rbx),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rbx),%xmm3
- movdqa %xmm0,416(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,416+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,448(%rsp)
- movdqa %xmm3,448+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-128(%rsi),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm3,%xmm4
- movq 0(%rbx),%rdx
-
- movq %r12,%r9
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- movq %r13,%r10
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- movq %r14,%r11
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
-
- leaq 32-128(%rsp),%rsi
- movq %r15,%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 320(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 384(%rsp),%rdx
- leaq 384(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 448(%rsp),%rdx
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 352(%rsp),%rbx
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+64(%rsp),%rdx
- movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 0+96(%rsp),%rdx
- movq 8+96(%rsp),%r14
- leaq -128+96(%rsp),%rsi
- movq 16+96(%rsp),%r15
- movq 24+96(%rsp),%r8
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
-
- movq 128(%rsp),%rdx
- leaq 128(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 320(%rsp),%rdx
- leaq 320(%rsp),%rbx
- movq 0+128(%rsp),%r9
- movq 8+128(%rsp),%r10
- leaq -128+128(%rsp),%rsi
- movq 16+128(%rsp),%r11
- movq 24+128(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 192(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subx
-
- leaq 160(%rsp),%rbx
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 64(%rsp),%rdi
-
- call __ecp_nistz256_subx
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 352(%rsp),%rdx
- leaq 352(%rsp),%rbx
- movq 0+160(%rsp),%r9
- movq 8+160(%rsp),%r10
- leaq -128+160(%rsp),%rsi
- movq 16+160(%rsp),%r11
- movq 24+160(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- movq 96(%rsp),%rdx
- leaq 96(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montx
-
- leaq 32(%rsp),%rbx
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand L$ONE_mont(%rip),%xmm2
- pand L$ONE_mont+16(%rip),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 224(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 224+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 320(%rsp),%xmm2
- pand 320+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 256(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 256+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 352(%rsp),%xmm2
- pand 352+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
- leaq 480+56(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbx
-
- movq -8(%rsi),%rbp
-
- leaq (%rsi),%rsp
-
-L$add_affinex_epilogue:
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S b/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S
deleted file mode 100644
index fc6552c..0000000
--- a/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S
+++ /dev/null
@@ -1,322 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-.private_extern _beeu_mod_inverse_vartime
-.globl _beeu_mod_inverse_vartime
-.private_extern _beeu_mod_inverse_vartime
-.p2align 5
-_beeu_mod_inverse_vartime:
-
-_CET_ENDBR
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- pushq %rbx
-
- pushq %rsi
-
-
- subq $80,%rsp
-
- movq %rdi,0(%rsp)
-
-
- movq $1,%r8
- xorq %r9,%r9
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %rdi,%rdi
-
- xorq %r12,%r12
- xorq %r13,%r13
- xorq %r14,%r14
- xorq %r15,%r15
- xorq %rbp,%rbp
-
-
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
- vmovdqu %xmm0,48(%rsp)
- vmovdqu %xmm1,64(%rsp)
-
- vmovdqu 0(%rdx),%xmm0
- vmovdqu 16(%rdx),%xmm1
- vmovdqu %xmm0,16(%rsp)
- vmovdqu %xmm1,32(%rsp)
-
-L$beeu_loop:
- xorq %rbx,%rbx
- orq 48(%rsp),%rbx
- orq 56(%rsp),%rbx
- orq 64(%rsp),%rbx
- orq 72(%rsp),%rbx
- jz L$beeu_loop_end
-
-
-
-
-
-
-
-
-
-
- movq $1,%rcx
-
-
-L$beeu_shift_loop_XB:
- movq %rcx,%rbx
- andq 48(%rsp),%rbx
- jnz L$beeu_shift_loop_end_XB
-
-
- movq $1,%rbx
- andq %r8,%rbx
- jz L$shift1_0
- addq 0(%rdx),%r8
- adcq 8(%rdx),%r9
- adcq 16(%rdx),%r10
- adcq 24(%rdx),%r11
- adcq $0,%rdi
-
-L$shift1_0:
- shrdq $1,%r9,%r8
- shrdq $1,%r10,%r9
- shrdq $1,%r11,%r10
- shrdq $1,%rdi,%r11
- shrq $1,%rdi
-
- shlq $1,%rcx
-
-
-
-
-
- cmpq $0x8000000,%rcx
- jne L$beeu_shift_loop_XB
-
-L$beeu_shift_loop_end_XB:
- bsfq %rcx,%rcx
- testq %rcx,%rcx
- jz L$beeu_no_shift_XB
-
-
-
- movq 8+48(%rsp),%rax
- movq 16+48(%rsp),%rbx
- movq 24+48(%rsp),%rsi
-
- shrdq %cl,%rax,0+48(%rsp)
- shrdq %cl,%rbx,8+48(%rsp)
- shrdq %cl,%rsi,16+48(%rsp)
-
- shrq %cl,%rsi
- movq %rsi,24+48(%rsp)
-
-
-L$beeu_no_shift_XB:
-
- movq $1,%rcx
-
-
-L$beeu_shift_loop_YA:
- movq %rcx,%rbx
- andq 16(%rsp),%rbx
- jnz L$beeu_shift_loop_end_YA
-
-
- movq $1,%rbx
- andq %r12,%rbx
- jz L$shift1_1
- addq 0(%rdx),%r12
- adcq 8(%rdx),%r13
- adcq 16(%rdx),%r14
- adcq 24(%rdx),%r15
- adcq $0,%rbp
-
-L$shift1_1:
- shrdq $1,%r13,%r12
- shrdq $1,%r14,%r13
- shrdq $1,%r15,%r14
- shrdq $1,%rbp,%r15
- shrq $1,%rbp
-
- shlq $1,%rcx
-
-
-
-
-
- cmpq $0x8000000,%rcx
- jne L$beeu_shift_loop_YA
-
-L$beeu_shift_loop_end_YA:
- bsfq %rcx,%rcx
- testq %rcx,%rcx
- jz L$beeu_no_shift_YA
-
-
-
- movq 8+16(%rsp),%rax
- movq 16+16(%rsp),%rbx
- movq 24+16(%rsp),%rsi
-
- shrdq %cl,%rax,0+16(%rsp)
- shrdq %cl,%rbx,8+16(%rsp)
- shrdq %cl,%rsi,16+16(%rsp)
-
- shrq %cl,%rsi
- movq %rsi,24+16(%rsp)
-
-
-L$beeu_no_shift_YA:
-
- movq 48(%rsp),%rax
- movq 56(%rsp),%rbx
- movq 64(%rsp),%rsi
- movq 72(%rsp),%rcx
- subq 16(%rsp),%rax
- sbbq 24(%rsp),%rbx
- sbbq 32(%rsp),%rsi
- sbbq 40(%rsp),%rcx
- jnc L$beeu_B_bigger_than_A
-
-
- movq 16(%rsp),%rax
- movq 24(%rsp),%rbx
- movq 32(%rsp),%rsi
- movq 40(%rsp),%rcx
- subq 48(%rsp),%rax
- sbbq 56(%rsp),%rbx
- sbbq 64(%rsp),%rsi
- sbbq 72(%rsp),%rcx
- movq %rax,16(%rsp)
- movq %rbx,24(%rsp)
- movq %rsi,32(%rsp)
- movq %rcx,40(%rsp)
-
-
- addq %r8,%r12
- adcq %r9,%r13
- adcq %r10,%r14
- adcq %r11,%r15
- adcq %rdi,%rbp
- jmp L$beeu_loop
-
-L$beeu_B_bigger_than_A:
-
- movq %rax,48(%rsp)
- movq %rbx,56(%rsp)
- movq %rsi,64(%rsp)
- movq %rcx,72(%rsp)
-
-
- addq %r12,%r8
- adcq %r13,%r9
- adcq %r14,%r10
- adcq %r15,%r11
- adcq %rbp,%rdi
-
- jmp L$beeu_loop
-
-L$beeu_loop_end:
-
-
-
-
- movq 16(%rsp),%rbx
- subq $1,%rbx
- orq 24(%rsp),%rbx
- orq 32(%rsp),%rbx
- orq 40(%rsp),%rbx
-
- jnz L$beeu_err
-
-
-
-
- movq 0(%rdx),%r8
- movq 8(%rdx),%r9
- movq 16(%rdx),%r10
- movq 24(%rdx),%r11
- xorq %rdi,%rdi
-
-L$beeu_reduction_loop:
- movq %r12,16(%rsp)
- movq %r13,24(%rsp)
- movq %r14,32(%rsp)
- movq %r15,40(%rsp)
- movq %rbp,48(%rsp)
-
-
- subq %r8,%r12
- sbbq %r9,%r13
- sbbq %r10,%r14
- sbbq %r11,%r15
- sbbq $0,%rbp
-
-
- cmovcq 16(%rsp),%r12
- cmovcq 24(%rsp),%r13
- cmovcq 32(%rsp),%r14
- cmovcq 40(%rsp),%r15
- jnc L$beeu_reduction_loop
-
-
- subq %r12,%r8
- sbbq %r13,%r9
- sbbq %r14,%r10
- sbbq %r15,%r11
-
-L$beeu_save:
-
- movq 0(%rsp),%rdi
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
-
- movq $1,%rax
- jmp L$beeu_finish
-
-L$beeu_err:
-
- xorq %rax,%rax
-
-L$beeu_finish:
- addq $80,%rsp
-
- popq %rsi
-
- popq %rbx
-
- popq %r15
-
- popq %r14
-
- popq %r13
-
- popq %r12
-
- popq %rbp
-
- ret
-
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S
deleted file mode 100644
index 5fdf105..0000000
--- a/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S
+++ /dev/null
@@ -1,57 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-
-.globl _CRYPTO_rdrand
-.private_extern _CRYPTO_rdrand
-
-.p2align 4
-_CRYPTO_rdrand:
-
-_CET_ENDBR
- xorq %rax,%rax
-.byte 72,15,199,242
-
- adcq %rax,%rax
- movq %rdx,0(%rdi)
- ret
-
-
-
-
-
-
-
-.globl _CRYPTO_rdrand_multiple8_buf
-.private_extern _CRYPTO_rdrand_multiple8_buf
-
-.p2align 4
-_CRYPTO_rdrand_multiple8_buf:
-
-_CET_ENDBR
- testq %rsi,%rsi
- jz L$out
- movq $8,%rdx
-L$loop:
-.byte 72,15,199,241
- jnc L$err
- movq %rcx,0(%rdi)
- addq %rdx,%rdi
- subq %rdx,%rsi
- jnz L$loop
-L$out:
- movq $1,%rax
- ret
-L$err:
- xorq %rax,%rax
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S b/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S
deleted file mode 100644
index 3672309..0000000
--- a/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S
+++ /dev/null
@@ -1,1749 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-.globl _rsaz_1024_sqr_avx2
-.private_extern _rsaz_1024_sqr_avx2
-
-.p2align 6
-_rsaz_1024_sqr_avx2:
-
-_CET_ENDBR
- leaq (%rsp),%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- vzeroupper
- movq %rax,%rbp
-
- movq %rdx,%r13
- subq $832,%rsp
- movq %r13,%r15
- subq $-128,%rdi
- subq $-128,%rsi
- subq $-128,%r13
-
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- vpxor %ymm9,%ymm9,%ymm9
- jz L$sqr_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%r13),%ymm0
- andq $-2048,%rsp
- vmovdqu 32-128(%r13),%ymm1
- vmovdqu 64-128(%r13),%ymm2
- vmovdqu 96-128(%r13),%ymm3
- vmovdqu 128-128(%r13),%ymm4
- vmovdqu 160-128(%r13),%ymm5
- vmovdqu 192-128(%r13),%ymm6
- vmovdqu 224-128(%r13),%ymm7
- vmovdqu 256-128(%r13),%ymm8
- leaq 832+128(%rsp),%r13
- vmovdqu %ymm0,0-128(%r13)
- vmovdqu %ymm1,32-128(%r13)
- vmovdqu %ymm2,64-128(%r13)
- vmovdqu %ymm3,96-128(%r13)
- vmovdqu %ymm4,128-128(%r13)
- vmovdqu %ymm5,160-128(%r13)
- vmovdqu %ymm6,192-128(%r13)
- vmovdqu %ymm7,224-128(%r13)
- vmovdqu %ymm8,256-128(%r13)
- vmovdqu %ymm9,288-128(%r13)
-
-L$sqr_1024_no_n_copy:
- andq $-1024,%rsp
-
- vmovdqu 32-128(%rsi),%ymm1
- vmovdqu 64-128(%rsi),%ymm2
- vmovdqu 96-128(%rsi),%ymm3
- vmovdqu 128-128(%rsi),%ymm4
- vmovdqu 160-128(%rsi),%ymm5
- vmovdqu 192-128(%rsi),%ymm6
- vmovdqu 224-128(%rsi),%ymm7
- vmovdqu 256-128(%rsi),%ymm8
-
- leaq 192(%rsp),%rbx
- vmovdqu L$and_mask(%rip),%ymm15
- jmp L$OOP_GRANDE_SQR_1024
-
-.p2align 5
-L$OOP_GRANDE_SQR_1024:
- leaq 576+128(%rsp),%r9
- leaq 448(%rsp),%r12
-
-
-
-
- vpaddq %ymm1,%ymm1,%ymm1
- vpbroadcastq 0-128(%rsi),%ymm10
- vpaddq %ymm2,%ymm2,%ymm2
- vmovdqa %ymm1,0-128(%r9)
- vpaddq %ymm3,%ymm3,%ymm3
- vmovdqa %ymm2,32-128(%r9)
- vpaddq %ymm4,%ymm4,%ymm4
- vmovdqa %ymm3,64-128(%r9)
- vpaddq %ymm5,%ymm5,%ymm5
- vmovdqa %ymm4,96-128(%r9)
- vpaddq %ymm6,%ymm6,%ymm6
- vmovdqa %ymm5,128-128(%r9)
- vpaddq %ymm7,%ymm7,%ymm7
- vmovdqa %ymm6,160-128(%r9)
- vpaddq %ymm8,%ymm8,%ymm8
- vmovdqa %ymm7,192-128(%r9)
- vpxor %ymm9,%ymm9,%ymm9
- vmovdqa %ymm8,224-128(%r9)
-
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpbroadcastq 32-128(%rsi),%ymm11
- vmovdqu %ymm9,288-192(%rbx)
- vpmuludq %ymm10,%ymm1,%ymm1
- vmovdqu %ymm9,320-448(%r12)
- vpmuludq %ymm10,%ymm2,%ymm2
- vmovdqu %ymm9,352-448(%r12)
- vpmuludq %ymm10,%ymm3,%ymm3
- vmovdqu %ymm9,384-448(%r12)
- vpmuludq %ymm10,%ymm4,%ymm4
- vmovdqu %ymm9,416-448(%r12)
- vpmuludq %ymm10,%ymm5,%ymm5
- vmovdqu %ymm9,448-448(%r12)
- vpmuludq %ymm10,%ymm6,%ymm6
- vmovdqu %ymm9,480-448(%r12)
- vpmuludq %ymm10,%ymm7,%ymm7
- vmovdqu %ymm9,512-448(%r12)
- vpmuludq %ymm10,%ymm8,%ymm8
- vpbroadcastq 64-128(%rsi),%ymm10
- vmovdqu %ymm9,544-448(%r12)
-
- movq %rsi,%r15
- movl $4,%r14d
- jmp L$sqr_entry_1024
-.p2align 5
-L$OOP_SQR_1024:
- vpbroadcastq 32-128(%r15),%ymm11
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpaddq 0-192(%rbx),%ymm0,%ymm0
- vpmuludq 0-128(%r9),%ymm10,%ymm1
- vpaddq 32-192(%rbx),%ymm1,%ymm1
- vpmuludq 32-128(%r9),%ymm10,%ymm2
- vpaddq 64-192(%rbx),%ymm2,%ymm2
- vpmuludq 64-128(%r9),%ymm10,%ymm3
- vpaddq 96-192(%rbx),%ymm3,%ymm3
- vpmuludq 96-128(%r9),%ymm10,%ymm4
- vpaddq 128-192(%rbx),%ymm4,%ymm4
- vpmuludq 128-128(%r9),%ymm10,%ymm5
- vpaddq 160-192(%rbx),%ymm5,%ymm5
- vpmuludq 160-128(%r9),%ymm10,%ymm6
- vpaddq 192-192(%rbx),%ymm6,%ymm6
- vpmuludq 192-128(%r9),%ymm10,%ymm7
- vpaddq 224-192(%rbx),%ymm7,%ymm7
- vpmuludq 224-128(%r9),%ymm10,%ymm8
- vpbroadcastq 64-128(%r15),%ymm10
- vpaddq 256-192(%rbx),%ymm8,%ymm8
-L$sqr_entry_1024:
- vmovdqu %ymm0,0-192(%rbx)
- vmovdqu %ymm1,32-192(%rbx)
-
- vpmuludq 32-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 32-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 64-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 96-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 128-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 160-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 192-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 224-128(%r9),%ymm11,%ymm0
- vpbroadcastq 96-128(%r15),%ymm11
- vpaddq 288-192(%rbx),%ymm0,%ymm0
-
- vmovdqu %ymm2,64-192(%rbx)
- vmovdqu %ymm3,96-192(%rbx)
-
- vpmuludq 64-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 64-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 96-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 128-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 160-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 224-128(%r9),%ymm10,%ymm1
- vpbroadcastq 128-128(%r15),%ymm10
- vpaddq 320-448(%r12),%ymm1,%ymm1
-
- vmovdqu %ymm4,128-192(%rbx)
- vmovdqu %ymm5,160-192(%rbx)
-
- vpmuludq 96-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 96-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq 128-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm0,%ymm0
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq 224-128(%r9),%ymm11,%ymm2
- vpbroadcastq 160-128(%r15),%ymm11
- vpaddq 352-448(%r12),%ymm2,%ymm2
-
- vmovdqu %ymm6,192-192(%rbx)
- vmovdqu %ymm7,224-192(%rbx)
-
- vpmuludq 128-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 128-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 160-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 192-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 224-128(%r9),%ymm10,%ymm3
- vpbroadcastq 192-128(%r15),%ymm10
- vpaddq 384-448(%r12),%ymm3,%ymm3
-
- vmovdqu %ymm8,256-192(%rbx)
- vmovdqu %ymm0,288-192(%rbx)
- leaq 8(%rbx),%rbx
-
- vpmuludq 160-128(%rsi),%ymm11,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 224-128(%r9),%ymm11,%ymm4
- vpbroadcastq 224-128(%r15),%ymm11
- vpaddq 416-448(%r12),%ymm4,%ymm4
-
- vmovdqu %ymm1,320-448(%r12)
- vmovdqu %ymm2,352-448(%r12)
-
- vpmuludq 192-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpbroadcastq 256-128(%r15),%ymm0
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq 224-128(%r9),%ymm10,%ymm5
- vpbroadcastq 0+8-128(%r15),%ymm10
- vpaddq 448-448(%r12),%ymm5,%ymm5
-
- vmovdqu %ymm3,384-448(%r12)
- vmovdqu %ymm4,416-448(%r12)
- leaq 8(%r15),%r15
-
- vpmuludq 224-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 224-128(%r9),%ymm11,%ymm6
- vpaddq 480-448(%r12),%ymm6,%ymm6
-
- vpmuludq 256-128(%rsi),%ymm0,%ymm7
- vmovdqu %ymm5,448-448(%r12)
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vmovdqu %ymm6,480-448(%r12)
- vmovdqu %ymm7,512-448(%r12)
- leaq 8(%r12),%r12
-
- decl %r14d
- jnz L$OOP_SQR_1024
-
- vmovdqu 256(%rsp),%ymm8
- vmovdqu 288(%rsp),%ymm1
- vmovdqu 320(%rsp),%ymm2
- leaq 192(%rsp),%rbx
-
- vpsrlq $29,%ymm8,%ymm14
- vpand %ymm15,%ymm8,%ymm8
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
-
- vpermq $0x93,%ymm14,%ymm14
- vpxor %ymm9,%ymm9,%ymm9
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm8,%ymm8
- vpblendd $3,%ymm11,%ymm9,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,288-192(%rbx)
- vmovdqu %ymm2,320-192(%rbx)
-
- movq (%rsp),%rax
- movq 8(%rsp),%r10
- movq 16(%rsp),%r11
- movq 24(%rsp),%r12
- vmovdqu 32(%rsp),%ymm1
- vmovdqu 64-192(%rbx),%ymm2
- vmovdqu 96-192(%rbx),%ymm3
- vmovdqu 128-192(%rbx),%ymm4
- vmovdqu 160-192(%rbx),%ymm5
- vmovdqu 192-192(%rbx),%ymm6
- vmovdqu 224-192(%rbx),%ymm7
-
- movq %rax,%r9
- imull %ecx,%eax
- andl $0x1fffffff,%eax
- vmovd %eax,%xmm12
-
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpbroadcastq %xmm12,%ymm12
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- shrq $29,%r9
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- addq %r9,%r10
- addq %rax,%r11
- imulq 24-128(%r13),%rdx
- addq %rdx,%r12
-
- movq %r10,%rax
- imull %ecx,%eax
- andl $0x1fffffff,%eax
-
- movl $9,%r14d
- jmp L$OOP_REDUCE_1024
-
-.p2align 5
-L$OOP_REDUCE_1024:
- vmovd %eax,%xmm13
- vpbroadcastq %xmm13,%ymm13
-
- vpmuludq 32-128(%r13),%ymm12,%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm10,%ymm1,%ymm1
- addq %rax,%r10
- vpmuludq 64-128(%r13),%ymm12,%ymm14
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm14,%ymm2,%ymm2
- vpmuludq 96-128(%r13),%ymm12,%ymm11
-.byte 0x67
- addq %rax,%r11
-.byte 0x67
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- shrq $29,%r10
- vpaddq %ymm11,%ymm3,%ymm3
- vpmuludq 128-128(%r13),%ymm12,%ymm10
- addq %rax,%r12
- addq %r10,%r11
- vpaddq %ymm10,%ymm4,%ymm4
- vpmuludq 160-128(%r13),%ymm12,%ymm14
- movq %r11,%rax
- imull %ecx,%eax
- vpaddq %ymm14,%ymm5,%ymm5
- vpmuludq 192-128(%r13),%ymm12,%ymm11
- andl $0x1fffffff,%eax
- vpaddq %ymm11,%ymm6,%ymm6
- vpmuludq 224-128(%r13),%ymm12,%ymm10
- vpaddq %ymm10,%ymm7,%ymm7
- vpmuludq 256-128(%r13),%ymm12,%ymm14
- vmovd %eax,%xmm12
-
- vpaddq %ymm14,%ymm8,%ymm8
-
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 32-8-128(%r13),%ymm13,%ymm11
- vmovdqu 96-8-128(%r13),%ymm14
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm1,%ymm1
- vpmuludq 64-8-128(%r13),%ymm13,%ymm10
- vmovdqu 128-8-128(%r13),%ymm11
- addq %rax,%r11
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm10,%ymm2,%ymm2
- addq %r12,%rax
- shrq $29,%r11
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 160-8-128(%r13),%ymm10
- addq %r11,%rax
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 192-8-128(%r13),%ymm14
-.byte 0x67
- movq %rax,%r12
- imull %ecx,%eax
- vpaddq %ymm11,%ymm4,%ymm4
- vpmuludq %ymm13,%ymm10,%ymm10
-.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
- andl $0x1fffffff,%eax
- vpaddq %ymm10,%ymm5,%ymm5
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 256-8-128(%r13),%ymm10
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 288-8-128(%r13),%ymm9
- vmovd %eax,%xmm0
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm7,%ymm7
- vpmuludq %ymm13,%ymm10,%ymm10
- vmovdqu 32-16-128(%r13),%ymm14
- vpbroadcastq %xmm0,%ymm0
- vpaddq %ymm10,%ymm8,%ymm8
- vpmuludq %ymm13,%ymm9,%ymm9
- vmovdqu 64-16-128(%r13),%ymm11
- addq %rax,%r12
-
- vmovdqu 32-24-128(%r13),%ymm13
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 96-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq %ymm0,%ymm13,%ymm13
- vpmuludq %ymm12,%ymm11,%ymm11
-.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
- vpaddq %ymm1,%ymm13,%ymm13
- vpaddq %ymm11,%ymm2,%ymm2
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 160-16-128(%r13),%ymm11
-.byte 0x67
- vmovq %xmm13,%rax
- vmovdqu %ymm13,(%rsp)
- vpaddq %ymm10,%ymm3,%ymm3
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 192-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq %ymm12,%ymm11,%ymm11
- vmovdqu 224-16-128(%r13),%ymm14
- vpaddq %ymm11,%ymm5,%ymm5
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 256-16-128(%r13),%ymm11
- vpaddq %ymm10,%ymm6,%ymm6
- vpmuludq %ymm12,%ymm14,%ymm14
- shrq $29,%r12
- vmovdqu 288-16-128(%r13),%ymm10
- addq %r12,%rax
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq %ymm12,%ymm11,%ymm11
-
- movq %rax,%r9
- imull %ecx,%eax
- vpaddq %ymm11,%ymm8,%ymm8
- vpmuludq %ymm12,%ymm10,%ymm10
- andl $0x1fffffff,%eax
- vmovd %eax,%xmm12
- vmovdqu 96-24-128(%r13),%ymm11
-.byte 0x67
- vpaddq %ymm10,%ymm9,%ymm9
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 64-24-128(%r13),%ymm0,%ymm14
- vmovdqu 128-24-128(%r13),%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- movq 8(%rsp),%r10
- vpaddq %ymm14,%ymm2,%ymm1
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 160-24-128(%r13),%ymm14
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
-.byte 0x67
- shrq $29,%r9
- movq 16(%rsp),%r11
- vpaddq %ymm11,%ymm3,%ymm2
- vpmuludq %ymm0,%ymm10,%ymm10
- vmovdqu 192-24-128(%r13),%ymm11
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- vpaddq %ymm10,%ymm4,%ymm3
- vpmuludq %ymm0,%ymm14,%ymm14
- vmovdqu 224-24-128(%r13),%ymm10
- imulq 24-128(%r13),%rdx
- addq %rax,%r11
- leaq (%r9,%r10,1),%rax
- vpaddq %ymm14,%ymm5,%ymm4
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 256-24-128(%r13),%ymm14
- movq %rax,%r10
- imull %ecx,%eax
- vpmuludq %ymm0,%ymm10,%ymm10
- vpaddq %ymm11,%ymm6,%ymm5
- vmovdqu 288-24-128(%r13),%ymm11
- andl $0x1fffffff,%eax
- vpaddq %ymm10,%ymm7,%ymm6
- vpmuludq %ymm0,%ymm14,%ymm14
- addq 24(%rsp),%rdx
- vpaddq %ymm14,%ymm8,%ymm7
- vpmuludq %ymm0,%ymm11,%ymm11
- vpaddq %ymm11,%ymm9,%ymm8
- vmovq %r12,%xmm9
- movq %rdx,%r12
-
- decl %r14d
- jnz L$OOP_REDUCE_1024
- leaq 448(%rsp),%r12
- vpaddq %ymm9,%ymm13,%ymm0
- vpxor %ymm9,%ymm9,%ymm9
-
- vpaddq 288-192(%rbx),%ymm0,%ymm0
- vpaddq 320-448(%r12),%ymm1,%ymm1
- vpaddq 352-448(%r12),%ymm2,%ymm2
- vpaddq 384-448(%r12),%ymm3,%ymm3
- vpaddq 416-448(%r12),%ymm4,%ymm4
- vpaddq 448-448(%r12),%ymm5,%ymm5
- vpaddq 480-448(%r12),%ymm6,%ymm6
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vpaddq 544-448(%r12),%ymm8,%ymm8
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm13,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vmovdqu %ymm0,0-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,32-128(%rdi)
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vmovdqu %ymm2,64-128(%rdi)
- vpaddq %ymm13,%ymm4,%ymm4
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vpaddq %ymm13,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vmovdqu %ymm4,128-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vmovdqu %ymm5,160-128(%rdi)
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vmovdqu %ymm6,192-128(%rdi)
- vpaddq %ymm13,%ymm8,%ymm8
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
-
- movq %rdi,%rsi
- decl %r8d
- jne L$OOP_GRANDE_SQR_1024
-
- vzeroall
- movq %rbp,%rax
-
- movq -48(%rax),%r15
-
- movq -40(%rax),%r14
-
- movq -32(%rax),%r13
-
- movq -24(%rax),%r12
-
- movq -16(%rax),%rbp
-
- movq -8(%rax),%rbx
-
- leaq (%rax),%rsp
-
-L$sqr_1024_epilogue:
- ret
-
-
-.globl _rsaz_1024_mul_avx2
-.private_extern _rsaz_1024_mul_avx2
-
-.p2align 6
-_rsaz_1024_mul_avx2:
-
-_CET_ENDBR
- leaq (%rsp),%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- movq %rax,%rbp
-
- vzeroall
- movq %rdx,%r13
- subq $64,%rsp
-
-
-
-
-
-
-.byte 0x67,0x67
- movq %rsi,%r15
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- movq %rsi,%r15
- cmovnzq %r13,%rsi
- cmovnzq %r15,%r13
-
- movq %rcx,%r15
- subq $-128,%rsi
- subq $-128,%rcx
- subq $-128,%rdi
-
- andq $4095,%r15
- addq $320,%r15
-.byte 0x67,0x67
- shrq $12,%r15
- jz L$mul_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%rcx),%ymm0
- andq $-512,%rsp
- vmovdqu 32-128(%rcx),%ymm1
- vmovdqu 64-128(%rcx),%ymm2
- vmovdqu 96-128(%rcx),%ymm3
- vmovdqu 128-128(%rcx),%ymm4
- vmovdqu 160-128(%rcx),%ymm5
- vmovdqu 192-128(%rcx),%ymm6
- vmovdqu 224-128(%rcx),%ymm7
- vmovdqu 256-128(%rcx),%ymm8
- leaq 64+128(%rsp),%rcx
- vmovdqu %ymm0,0-128(%rcx)
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm1,32-128(%rcx)
- vpxor %ymm1,%ymm1,%ymm1
- vmovdqu %ymm2,64-128(%rcx)
- vpxor %ymm2,%ymm2,%ymm2
- vmovdqu %ymm3,96-128(%rcx)
- vpxor %ymm3,%ymm3,%ymm3
- vmovdqu %ymm4,128-128(%rcx)
- vpxor %ymm4,%ymm4,%ymm4
- vmovdqu %ymm5,160-128(%rcx)
- vpxor %ymm5,%ymm5,%ymm5
- vmovdqu %ymm6,192-128(%rcx)
- vpxor %ymm6,%ymm6,%ymm6
- vmovdqu %ymm7,224-128(%rcx)
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqu %ymm8,256-128(%rcx)
- vmovdqa %ymm0,%ymm8
- vmovdqu %ymm9,288-128(%rcx)
-L$mul_1024_no_n_copy:
- andq $-64,%rsp
-
- movq (%r13),%rbx
- vpbroadcastq (%r13),%ymm10
- vmovdqu %ymm0,(%rsp)
- xorq %r9,%r9
-.byte 0x67
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
-
- vmovdqu L$and_mask(%rip),%ymm15
- movl $9,%r14d
- vmovdqu %ymm9,288-128(%rdi)
- jmp L$oop_mul_1024
-
-.p2align 5
-L$oop_mul_1024:
- vpsrlq $29,%ymm3,%ymm9
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r9,%rax
- movq %rbx,%r10
- imulq 8-128(%rsi),%r10
- addq 8(%rsp),%r10
-
- movq %rax,%r9
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- movq %rbx,%r11
- imulq 16-128(%rsi),%r11
- addq 16(%rsp),%r11
-
- movq %rbx,%r12
- imulq 24-128(%rsi),%r12
- addq 24(%rsp),%r12
- vpmuludq 32-128(%rsi),%ymm10,%ymm0
- vmovd %eax,%xmm11
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq 64-128(%rsi),%ymm10,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 96-128(%rsi),%ymm10,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq 128-128(%rsi),%ymm10,%ymm0
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq 160-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 192-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq 224-128(%rsi),%ymm10,%ymm0
- vpermq $0x93,%ymm9,%ymm9
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq 256-128(%rsi),%ymm10,%ymm12
- vpbroadcastq 8(%r13),%ymm10
- vpaddq %ymm12,%ymm8,%ymm8
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%rcx),%rax
- addq %rax,%r11
- shrq $29,%r9
- imulq 24-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r9,%r10
-
- vpmuludq 32-128(%rcx),%ymm11,%ymm13
- vmovq %xmm10,%rbx
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 64-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm2,%ymm2
- vpmuludq 96-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 128-128(%rcx),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 160-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm5,%ymm5
- vpmuludq 192-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 224-128(%rcx),%ymm11,%ymm13
- vpblendd $3,%ymm14,%ymm9,%ymm12
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 256-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm0,%ymm8,%ymm8
-
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rsi),%ymm12
- movq %rbx,%rax
- imulq 8-128(%rsi),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rsi),%ymm13
-
- movq %r10,%rax
- vpblendd $0xfc,%ymm14,%ymm9,%ymm9
- imull %r8d,%eax
- vpaddq %ymm9,%ymm4,%ymm4
- andl $0x1fffffff,%eax
-
- imulq 16-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovd %eax,%xmm11
- vmovdqu -8+96-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -8+128-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+160-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+192-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -8+224-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+256-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+288-128(%rsi),%ymm9
- vpaddq %ymm12,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm13,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm9,%ymm9
- vpbroadcastq 16(%r13),%ymm10
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rcx),%ymm0
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rcx),%ymm12
- shrq $29,%r10
- imulq 16-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r10,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -8+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rsi),%ymm0
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r11,%rax
-
- vmovdqu -16+64-128(%rsi),%ymm12
- movq %rax,%r11
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- imulq 8-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -16+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -16+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -16+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 24(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rcx),%ymm0
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -16+64-128(%rcx),%ymm12
- imulq 8-128(%rcx),%rdx
- addq %rdx,%r12
- shrq $29,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -16+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+32-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+64-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm9,%ymm9
-
- addq %r11,%r12
- imulq -128(%rsi),%rbx
- addq %rbx,%r12
-
- movq %r12,%rax
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -24+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -24+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -24+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 32(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
- addq $32,%r13
-
- vmovdqu -24+32-128(%rcx),%ymm0
- imulq -128(%rcx),%rax
- addq %rax,%r12
- shrq $29,%r12
-
- vmovdqu -24+64-128(%rcx),%ymm12
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -24+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm0
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu %ymm0,(%rsp)
- vpaddq %ymm12,%ymm2,%ymm1
- vmovdqu -24+128-128(%rcx),%ymm0
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm2
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm3
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm4
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm5
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+288-128(%rcx),%ymm13
- movq %r12,%r9
- vpaddq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm11,%ymm12,%ymm12
- addq (%rsp),%r9
- vpaddq %ymm12,%ymm8,%ymm7
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovq %r12,%xmm12
- vpaddq %ymm13,%ymm9,%ymm8
-
- decl %r14d
- jnz L$oop_mul_1024
- vpaddq (%rsp),%ymm12,%ymm0
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm10,%ymm10
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpermq $0x93,%ymm11,%ymm11
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm10,%ymm10
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm11,%ymm11
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vmovdqu %ymm0,0-128(%rdi)
- vmovdqu %ymm1,32-128(%rdi)
- vmovdqu %ymm2,64-128(%rdi)
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vmovdqu %ymm4,128-128(%rdi)
- vmovdqu %ymm5,160-128(%rdi)
- vmovdqu %ymm6,192-128(%rdi)
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
- vzeroupper
-
- movq %rbp,%rax
-
- movq -48(%rax),%r15
-
- movq -40(%rax),%r14
-
- movq -32(%rax),%r13
-
- movq -24(%rax),%r12
-
- movq -16(%rax),%rbp
-
- movq -8(%rax),%rbx
-
- leaq (%rax),%rsp
-
-L$mul_1024_epilogue:
- ret
-
-
-.globl _rsaz_1024_red2norm_avx2
-.private_extern _rsaz_1024_red2norm_avx2
-
-.p2align 5
-_rsaz_1024_red2norm_avx2:
-
-_CET_ENDBR
- subq $-128,%rsi
- xorq %rax,%rax
- movq -128(%rsi),%r8
- movq -120(%rsi),%r9
- movq -112(%rsi),%r10
- shlq $0,%r8
- shlq $29,%r9
- movq %r10,%r11
- shlq $58,%r10
- shrq $6,%r11
- addq %r8,%rax
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,0(%rdi)
- movq %r11,%rax
- movq -104(%rsi),%r8
- movq -96(%rsi),%r9
- shlq $23,%r8
- movq %r9,%r10
- shlq $52,%r9
- shrq $12,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,8(%rdi)
- movq %r10,%rax
- movq -88(%rsi),%r11
- movq -80(%rsi),%r8
- shlq $17,%r11
- movq %r8,%r9
- shlq $46,%r8
- shrq $18,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,16(%rdi)
- movq %r9,%rax
- movq -72(%rsi),%r10
- movq -64(%rsi),%r11
- shlq $11,%r10
- movq %r11,%r8
- shlq $40,%r11
- shrq $24,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,24(%rdi)
- movq %r8,%rax
- movq -56(%rsi),%r9
- movq -48(%rsi),%r10
- movq -40(%rsi),%r11
- shlq $5,%r9
- shlq $34,%r10
- movq %r11,%r8
- shlq $63,%r11
- shrq $1,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,32(%rdi)
- movq %r8,%rax
- movq -32(%rsi),%r9
- movq -24(%rsi),%r10
- shlq $28,%r9
- movq %r10,%r11
- shlq $57,%r10
- shrq $7,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,40(%rdi)
- movq %r11,%rax
- movq -16(%rsi),%r8
- movq -8(%rsi),%r9
- shlq $22,%r8
- movq %r9,%r10
- shlq $51,%r9
- shrq $13,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,48(%rdi)
- movq %r10,%rax
- movq 0(%rsi),%r11
- movq 8(%rsi),%r8
- shlq $16,%r11
- movq %r8,%r9
- shlq $45,%r8
- shrq $19,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,56(%rdi)
- movq %r9,%rax
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- shlq $10,%r10
- movq %r11,%r8
- shlq $39,%r11
- shrq $25,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,64(%rdi)
- movq %r8,%rax
- movq 32(%rsi),%r9
- movq 40(%rsi),%r10
- movq 48(%rsi),%r11
- shlq $4,%r9
- shlq $33,%r10
- movq %r11,%r8
- shlq $62,%r11
- shrq $2,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,72(%rdi)
- movq %r8,%rax
- movq 56(%rsi),%r9
- movq 64(%rsi),%r10
- shlq $27,%r9
- movq %r10,%r11
- shlq $56,%r10
- shrq $8,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,80(%rdi)
- movq %r11,%rax
- movq 72(%rsi),%r8
- movq 80(%rsi),%r9
- shlq $21,%r8
- movq %r9,%r10
- shlq $50,%r9
- shrq $14,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,88(%rdi)
- movq %r10,%rax
- movq 88(%rsi),%r11
- movq 96(%rsi),%r8
- shlq $15,%r11
- movq %r8,%r9
- shlq $44,%r8
- shrq $20,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,96(%rdi)
- movq %r9,%rax
- movq 104(%rsi),%r10
- movq 112(%rsi),%r11
- shlq $9,%r10
- movq %r11,%r8
- shlq $38,%r11
- shrq $26,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,104(%rdi)
- movq %r8,%rax
- movq 120(%rsi),%r9
- movq 128(%rsi),%r10
- movq 136(%rsi),%r11
- shlq $3,%r9
- shlq $32,%r10
- movq %r11,%r8
- shlq $61,%r11
- shrq $3,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,112(%rdi)
- movq %r8,%rax
- movq 144(%rsi),%r9
- movq 152(%rsi),%r10
- shlq $26,%r9
- movq %r10,%r11
- shlq $55,%r10
- shrq $9,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,120(%rdi)
- movq %r11,%rax
- ret
-
-
-
-.globl _rsaz_1024_norm2red_avx2
-.private_extern _rsaz_1024_norm2red_avx2
-
-.p2align 5
-_rsaz_1024_norm2red_avx2:
-
-_CET_ENDBR
- subq $-128,%rdi
- movq (%rsi),%r8
- movl $0x1fffffff,%eax
- movq 8(%rsi),%r9
- movq %r8,%r11
- shrq $0,%r11
- andq %rax,%r11
- movq %r11,-128(%rdi)
- movq %r8,%r10
- shrq $29,%r10
- andq %rax,%r10
- movq %r10,-120(%rdi)
- shrdq $58,%r9,%r8
- andq %rax,%r8
- movq %r8,-112(%rdi)
- movq 16(%rsi),%r10
- movq %r9,%r8
- shrq $23,%r8
- andq %rax,%r8
- movq %r8,-104(%rdi)
- shrdq $52,%r10,%r9
- andq %rax,%r9
- movq %r9,-96(%rdi)
- movq 24(%rsi),%r11
- movq %r10,%r9
- shrq $17,%r9
- andq %rax,%r9
- movq %r9,-88(%rdi)
- shrdq $46,%r11,%r10
- andq %rax,%r10
- movq %r10,-80(%rdi)
- movq 32(%rsi),%r8
- movq %r11,%r10
- shrq $11,%r10
- andq %rax,%r10
- movq %r10,-72(%rdi)
- shrdq $40,%r8,%r11
- andq %rax,%r11
- movq %r11,-64(%rdi)
- movq 40(%rsi),%r9
- movq %r8,%r11
- shrq $5,%r11
- andq %rax,%r11
- movq %r11,-56(%rdi)
- movq %r8,%r10
- shrq $34,%r10
- andq %rax,%r10
- movq %r10,-48(%rdi)
- shrdq $63,%r9,%r8
- andq %rax,%r8
- movq %r8,-40(%rdi)
- movq 48(%rsi),%r10
- movq %r9,%r8
- shrq $28,%r8
- andq %rax,%r8
- movq %r8,-32(%rdi)
- shrdq $57,%r10,%r9
- andq %rax,%r9
- movq %r9,-24(%rdi)
- movq 56(%rsi),%r11
- movq %r10,%r9
- shrq $22,%r9
- andq %rax,%r9
- movq %r9,-16(%rdi)
- shrdq $51,%r11,%r10
- andq %rax,%r10
- movq %r10,-8(%rdi)
- movq 64(%rsi),%r8
- movq %r11,%r10
- shrq $16,%r10
- andq %rax,%r10
- movq %r10,0(%rdi)
- shrdq $45,%r8,%r11
- andq %rax,%r11
- movq %r11,8(%rdi)
- movq 72(%rsi),%r9
- movq %r8,%r11
- shrq $10,%r11
- andq %rax,%r11
- movq %r11,16(%rdi)
- shrdq $39,%r9,%r8
- andq %rax,%r8
- movq %r8,24(%rdi)
- movq 80(%rsi),%r10
- movq %r9,%r8
- shrq $4,%r8
- andq %rax,%r8
- movq %r8,32(%rdi)
- movq %r9,%r11
- shrq $33,%r11
- andq %rax,%r11
- movq %r11,40(%rdi)
- shrdq $62,%r10,%r9
- andq %rax,%r9
- movq %r9,48(%rdi)
- movq 88(%rsi),%r11
- movq %r10,%r9
- shrq $27,%r9
- andq %rax,%r9
- movq %r9,56(%rdi)
- shrdq $56,%r11,%r10
- andq %rax,%r10
- movq %r10,64(%rdi)
- movq 96(%rsi),%r8
- movq %r11,%r10
- shrq $21,%r10
- andq %rax,%r10
- movq %r10,72(%rdi)
- shrdq $50,%r8,%r11
- andq %rax,%r11
- movq %r11,80(%rdi)
- movq 104(%rsi),%r9
- movq %r8,%r11
- shrq $15,%r11
- andq %rax,%r11
- movq %r11,88(%rdi)
- shrdq $44,%r9,%r8
- andq %rax,%r8
- movq %r8,96(%rdi)
- movq 112(%rsi),%r10
- movq %r9,%r8
- shrq $9,%r8
- andq %rax,%r8
- movq %r8,104(%rdi)
- shrdq $38,%r10,%r9
- andq %rax,%r9
- movq %r9,112(%rdi)
- movq 120(%rsi),%r11
- movq %r10,%r9
- shrq $3,%r9
- andq %rax,%r9
- movq %r9,120(%rdi)
- movq %r10,%r8
- shrq $32,%r8
- andq %rax,%r8
- movq %r8,128(%rdi)
- shrdq $61,%r11,%r10
- andq %rax,%r10
- movq %r10,136(%rdi)
- xorq %r8,%r8
- movq %r11,%r10
- shrq $26,%r10
- andq %rax,%r10
- movq %r10,144(%rdi)
- shrdq $55,%r8,%r11
- andq %rax,%r11
- movq %r11,152(%rdi)
- movq %r8,160(%rdi)
- movq %r8,168(%rdi)
- movq %r8,176(%rdi)
- movq %r8,184(%rdi)
- ret
-
-
-.globl _rsaz_1024_scatter5_avx2
-.private_extern _rsaz_1024_scatter5_avx2
-
-.p2align 5
-_rsaz_1024_scatter5_avx2:
-
-_CET_ENDBR
- vzeroupper
- vmovdqu L$scatter_permd(%rip),%ymm5
- shll $4,%edx
- leaq (%rdi,%rdx,1),%rdi
- movl $9,%eax
- jmp L$oop_scatter_1024
-
-.p2align 5
-L$oop_scatter_1024:
- vmovdqu (%rsi),%ymm0
- leaq 32(%rsi),%rsi
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,(%rdi)
- leaq 512(%rdi),%rdi
- decl %eax
- jnz L$oop_scatter_1024
-
- vzeroupper
- ret
-
-
-
-.globl _rsaz_1024_gather5_avx2
-.private_extern _rsaz_1024_gather5_avx2
-
-.p2align 5
-_rsaz_1024_gather5_avx2:
-
-_CET_ENDBR
- vzeroupper
- movq %rsp,%r11
-
- leaq -256(%rsp),%rsp
- andq $-32,%rsp
- leaq L$inc(%rip),%r10
- leaq -128(%rsp),%rax
-
- vmovd %edx,%xmm4
- vmovdqa (%r10),%ymm0
- vmovdqa 32(%r10),%ymm1
- vmovdqa 64(%r10),%ymm5
- vpbroadcastd %xmm4,%ymm4
-
- vpaddd %ymm5,%ymm0,%ymm2
- vpcmpeqd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm3
- vpcmpeqd %ymm4,%ymm1,%ymm1
- vmovdqa %ymm0,0+128(%rax)
- vpaddd %ymm5,%ymm2,%ymm0
- vpcmpeqd %ymm4,%ymm2,%ymm2
- vmovdqa %ymm1,32+128(%rax)
- vpaddd %ymm5,%ymm3,%ymm1
- vpcmpeqd %ymm4,%ymm3,%ymm3
- vmovdqa %ymm2,64+128(%rax)
- vpaddd %ymm5,%ymm0,%ymm2
- vpcmpeqd %ymm4,%ymm0,%ymm0
- vmovdqa %ymm3,96+128(%rax)
- vpaddd %ymm5,%ymm1,%ymm3
- vpcmpeqd %ymm4,%ymm1,%ymm1
- vmovdqa %ymm0,128+128(%rax)
- vpaddd %ymm5,%ymm2,%ymm8
- vpcmpeqd %ymm4,%ymm2,%ymm2
- vmovdqa %ymm1,160+128(%rax)
- vpaddd %ymm5,%ymm3,%ymm9
- vpcmpeqd %ymm4,%ymm3,%ymm3
- vmovdqa %ymm2,192+128(%rax)
- vpaddd %ymm5,%ymm8,%ymm10
- vpcmpeqd %ymm4,%ymm8,%ymm8
- vmovdqa %ymm3,224+128(%rax)
- vpaddd %ymm5,%ymm9,%ymm11
- vpcmpeqd %ymm4,%ymm9,%ymm9
- vpaddd %ymm5,%ymm10,%ymm12
- vpcmpeqd %ymm4,%ymm10,%ymm10
- vpaddd %ymm5,%ymm11,%ymm13
- vpcmpeqd %ymm4,%ymm11,%ymm11
- vpaddd %ymm5,%ymm12,%ymm14
- vpcmpeqd %ymm4,%ymm12,%ymm12
- vpaddd %ymm5,%ymm13,%ymm15
- vpcmpeqd %ymm4,%ymm13,%ymm13
- vpcmpeqd %ymm4,%ymm14,%ymm14
- vpcmpeqd %ymm4,%ymm15,%ymm15
-
- vmovdqa -32(%r10),%ymm7
- leaq 128(%rsi),%rsi
- movl $9,%edx
-
-L$oop_gather_1024:
- vmovdqa 0-128(%rsi),%ymm0
- vmovdqa 32-128(%rsi),%ymm1
- vmovdqa 64-128(%rsi),%ymm2
- vmovdqa 96-128(%rsi),%ymm3
- vpand 0+128(%rax),%ymm0,%ymm0
- vpand 32+128(%rax),%ymm1,%ymm1
- vpand 64+128(%rax),%ymm2,%ymm2
- vpor %ymm0,%ymm1,%ymm4
- vpand 96+128(%rax),%ymm3,%ymm3
- vmovdqa 128-128(%rsi),%ymm0
- vmovdqa 160-128(%rsi),%ymm1
- vpor %ymm2,%ymm3,%ymm5
- vmovdqa 192-128(%rsi),%ymm2
- vmovdqa 224-128(%rsi),%ymm3
- vpand 128+128(%rax),%ymm0,%ymm0
- vpand 160+128(%rax),%ymm1,%ymm1
- vpand 192+128(%rax),%ymm2,%ymm2
- vpor %ymm0,%ymm4,%ymm4
- vpand 224+128(%rax),%ymm3,%ymm3
- vpand 256-128(%rsi),%ymm8,%ymm0
- vpor %ymm1,%ymm5,%ymm5
- vpand 288-128(%rsi),%ymm9,%ymm1
- vpor %ymm2,%ymm4,%ymm4
- vpand 320-128(%rsi),%ymm10,%ymm2
- vpor %ymm3,%ymm5,%ymm5
- vpand 352-128(%rsi),%ymm11,%ymm3
- vpor %ymm0,%ymm4,%ymm4
- vpand 384-128(%rsi),%ymm12,%ymm0
- vpor %ymm1,%ymm5,%ymm5
- vpand 416-128(%rsi),%ymm13,%ymm1
- vpor %ymm2,%ymm4,%ymm4
- vpand 448-128(%rsi),%ymm14,%ymm2
- vpor %ymm3,%ymm5,%ymm5
- vpand 480-128(%rsi),%ymm15,%ymm3
- leaq 512(%rsi),%rsi
- vpor %ymm0,%ymm4,%ymm4
- vpor %ymm1,%ymm5,%ymm5
- vpor %ymm2,%ymm4,%ymm4
- vpor %ymm3,%ymm5,%ymm5
-
- vpor %ymm5,%ymm4,%ymm4
- vextracti128 $1,%ymm4,%xmm5
- vpor %xmm4,%xmm5,%xmm5
- vpermd %ymm5,%ymm7,%ymm5
- vmovdqu %ymm5,(%rdi)
- leaq 32(%rdi),%rdi
- decl %edx
- jnz L$oop_gather_1024
-
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- vzeroupper
- leaq (%r11),%rsp
-
- ret
-
-L$SEH_end_rsaz_1024_gather5:
-
-.section __DATA,__const
-.p2align 6
-L$and_mask:
-.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
-L$scatter_permd:
-.long 0,2,4,6,7,7,7,7
-L$gather_permd:
-.long 0,7,1,7,2,7,3,7
-L$inc:
-.long 0,0,0,0, 1,1,1,1
-.long 2,2,2,2, 3,3,3,3
-.long 4,4,4,4, 4,4,4,4
-.p2align 6
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S
deleted file mode 100644
index 6af6744..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S
+++ /dev/null
@@ -1,5463 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-.globl _sha1_block_data_order
-.private_extern _sha1_block_data_order
-
-.p2align 4
-_sha1_block_data_order:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%r10
- movl 0(%r10),%r9d
- movl 4(%r10),%r8d
- movl 8(%r10),%r10d
- testl $512,%r8d
- jz L$ialu
- testl $536870912,%r10d
- jnz _shaext_shortcut
- andl $296,%r10d
- cmpl $296,%r10d
- je _avx2_shortcut
- andl $268435456,%r8d
- andl $1073741824,%r9d
- orl %r9d,%r8d
- cmpl $1342177280,%r8d
- je _avx_shortcut
- jmp _ssse3_shortcut
-
-.p2align 4
-L$ialu:
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- movq %rdi,%r8
- subq $72,%rsp
- movq %rsi,%r9
- andq $-64,%rsp
- movq %rdx,%r10
- movq %rax,64(%rsp)
-
-L$prologue:
-
- movl 0(%r8),%esi
- movl 4(%r8),%edi
- movl 8(%r8),%r11d
- movl 12(%r8),%r12d
- movl 16(%r8),%r13d
- jmp L$loop
-
-.p2align 4
-L$loop:
- movl 0(%r9),%edx
- bswapl %edx
- movl 4(%r9),%ebp
- movl %r12d,%eax
- movl %edx,0(%rsp)
- movl %esi,%ecx
- bswapl %ebp
- xorl %r11d,%eax
- roll $5,%ecx
- andl %edi,%eax
- leal 1518500249(%rdx,%r13,1),%r13d
- addl %ecx,%r13d
- xorl %r12d,%eax
- roll $30,%edi
- addl %eax,%r13d
- movl 8(%r9),%r14d
- movl %r11d,%eax
- movl %ebp,4(%rsp)
- movl %r13d,%ecx
- bswapl %r14d
- xorl %edi,%eax
- roll $5,%ecx
- andl %esi,%eax
- leal 1518500249(%rbp,%r12,1),%r12d
- addl %ecx,%r12d
- xorl %r11d,%eax
- roll $30,%esi
- addl %eax,%r12d
- movl 12(%r9),%edx
- movl %edi,%eax
- movl %r14d,8(%rsp)
- movl %r12d,%ecx
- bswapl %edx
- xorl %esi,%eax
- roll $5,%ecx
- andl %r13d,%eax
- leal 1518500249(%r14,%r11,1),%r11d
- addl %ecx,%r11d
- xorl %edi,%eax
- roll $30,%r13d
- addl %eax,%r11d
- movl 16(%r9),%ebp
- movl %esi,%eax
- movl %edx,12(%rsp)
- movl %r11d,%ecx
- bswapl %ebp
- xorl %r13d,%eax
- roll $5,%ecx
- andl %r12d,%eax
- leal 1518500249(%rdx,%rdi,1),%edi
- addl %ecx,%edi
- xorl %esi,%eax
- roll $30,%r12d
- addl %eax,%edi
- movl 20(%r9),%r14d
- movl %r13d,%eax
- movl %ebp,16(%rsp)
- movl %edi,%ecx
- bswapl %r14d
- xorl %r12d,%eax
- roll $5,%ecx
- andl %r11d,%eax
- leal 1518500249(%rbp,%rsi,1),%esi
- addl %ecx,%esi
- xorl %r13d,%eax
- roll $30,%r11d
- addl %eax,%esi
- movl 24(%r9),%edx
- movl %r12d,%eax
- movl %r14d,20(%rsp)
- movl %esi,%ecx
- bswapl %edx
- xorl %r11d,%eax
- roll $5,%ecx
- andl %edi,%eax
- leal 1518500249(%r14,%r13,1),%r13d
- addl %ecx,%r13d
- xorl %r12d,%eax
- roll $30,%edi
- addl %eax,%r13d
- movl 28(%r9),%ebp
- movl %r11d,%eax
- movl %edx,24(%rsp)
- movl %r13d,%ecx
- bswapl %ebp
- xorl %edi,%eax
- roll $5,%ecx
- andl %esi,%eax
- leal 1518500249(%rdx,%r12,1),%r12d
- addl %ecx,%r12d
- xorl %r11d,%eax
- roll $30,%esi
- addl %eax,%r12d
- movl 32(%r9),%r14d
- movl %edi,%eax
- movl %ebp,28(%rsp)
- movl %r12d,%ecx
- bswapl %r14d
- xorl %esi,%eax
- roll $5,%ecx
- andl %r13d,%eax
- leal 1518500249(%rbp,%r11,1),%r11d
- addl %ecx,%r11d
- xorl %edi,%eax
- roll $30,%r13d
- addl %eax,%r11d
- movl 36(%r9),%edx
- movl %esi,%eax
- movl %r14d,32(%rsp)
- movl %r11d,%ecx
- bswapl %edx
- xorl %r13d,%eax
- roll $5,%ecx
- andl %r12d,%eax
- leal 1518500249(%r14,%rdi,1),%edi
- addl %ecx,%edi
- xorl %esi,%eax
- roll $30,%r12d
- addl %eax,%edi
- movl 40(%r9),%ebp
- movl %r13d,%eax
- movl %edx,36(%rsp)
- movl %edi,%ecx
- bswapl %ebp
- xorl %r12d,%eax
- roll $5,%ecx
- andl %r11d,%eax
- leal 1518500249(%rdx,%rsi,1),%esi
- addl %ecx,%esi
- xorl %r13d,%eax
- roll $30,%r11d
- addl %eax,%esi
- movl 44(%r9),%r14d
- movl %r12d,%eax
- movl %ebp,40(%rsp)
- movl %esi,%ecx
- bswapl %r14d
- xorl %r11d,%eax
- roll $5,%ecx
- andl %edi,%eax
- leal 1518500249(%rbp,%r13,1),%r13d
- addl %ecx,%r13d
- xorl %r12d,%eax
- roll $30,%edi
- addl %eax,%r13d
- movl 48(%r9),%edx
- movl %r11d,%eax
- movl %r14d,44(%rsp)
- movl %r13d,%ecx
- bswapl %edx
- xorl %edi,%eax
- roll $5,%ecx
- andl %esi,%eax
- leal 1518500249(%r14,%r12,1),%r12d
- addl %ecx,%r12d
- xorl %r11d,%eax
- roll $30,%esi
- addl %eax,%r12d
- movl 52(%r9),%ebp
- movl %edi,%eax
- movl %edx,48(%rsp)
- movl %r12d,%ecx
- bswapl %ebp
- xorl %esi,%eax
- roll $5,%ecx
- andl %r13d,%eax
- leal 1518500249(%rdx,%r11,1),%r11d
- addl %ecx,%r11d
- xorl %edi,%eax
- roll $30,%r13d
- addl %eax,%r11d
- movl 56(%r9),%r14d
- movl %esi,%eax
- movl %ebp,52(%rsp)
- movl %r11d,%ecx
- bswapl %r14d
- xorl %r13d,%eax
- roll $5,%ecx
- andl %r12d,%eax
- leal 1518500249(%rbp,%rdi,1),%edi
- addl %ecx,%edi
- xorl %esi,%eax
- roll $30,%r12d
- addl %eax,%edi
- movl 60(%r9),%edx
- movl %r13d,%eax
- movl %r14d,56(%rsp)
- movl %edi,%ecx
- bswapl %edx
- xorl %r12d,%eax
- roll $5,%ecx
- andl %r11d,%eax
- leal 1518500249(%r14,%rsi,1),%esi
- addl %ecx,%esi
- xorl %r13d,%eax
- roll $30,%r11d
- addl %eax,%esi
- xorl 0(%rsp),%ebp
- movl %r12d,%eax
- movl %edx,60(%rsp)
- movl %esi,%ecx
- xorl 8(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 32(%rsp),%ebp
- andl %edi,%eax
- leal 1518500249(%rdx,%r13,1),%r13d
- roll $30,%edi
- xorl %r12d,%eax
- addl %ecx,%r13d
- roll $1,%ebp
- addl %eax,%r13d
- xorl 4(%rsp),%r14d
- movl %r11d,%eax
- movl %ebp,0(%rsp)
- movl %r13d,%ecx
- xorl 12(%rsp),%r14d
- xorl %edi,%eax
- roll $5,%ecx
- xorl 36(%rsp),%r14d
- andl %esi,%eax
- leal 1518500249(%rbp,%r12,1),%r12d
- roll $30,%esi
- xorl %r11d,%eax
- addl %ecx,%r12d
- roll $1,%r14d
- addl %eax,%r12d
- xorl 8(%rsp),%edx
- movl %edi,%eax
- movl %r14d,4(%rsp)
- movl %r12d,%ecx
- xorl 16(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- xorl 40(%rsp),%edx
- andl %r13d,%eax
- leal 1518500249(%r14,%r11,1),%r11d
- roll $30,%r13d
- xorl %edi,%eax
- addl %ecx,%r11d
- roll $1,%edx
- addl %eax,%r11d
- xorl 12(%rsp),%ebp
- movl %esi,%eax
- movl %edx,8(%rsp)
- movl %r11d,%ecx
- xorl 20(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 44(%rsp),%ebp
- andl %r12d,%eax
- leal 1518500249(%rdx,%rdi,1),%edi
- roll $30,%r12d
- xorl %esi,%eax
- addl %ecx,%edi
- roll $1,%ebp
- addl %eax,%edi
- xorl 16(%rsp),%r14d
- movl %r13d,%eax
- movl %ebp,12(%rsp)
- movl %edi,%ecx
- xorl 24(%rsp),%r14d
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 48(%rsp),%r14d
- andl %r11d,%eax
- leal 1518500249(%rbp,%rsi,1),%esi
- roll $30,%r11d
- xorl %r13d,%eax
- addl %ecx,%esi
- roll $1,%r14d
- addl %eax,%esi
- xorl 20(%rsp),%edx
- movl %edi,%eax
- movl %r14d,16(%rsp)
- movl %esi,%ecx
- xorl 28(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 52(%rsp),%edx
- leal 1859775393(%r14,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%edx
- xorl 24(%rsp),%ebp
- movl %esi,%eax
- movl %edx,20(%rsp)
- movl %r13d,%ecx
- xorl 32(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 56(%rsp),%ebp
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%ebp
- xorl 28(%rsp),%r14d
- movl %r13d,%eax
- movl %ebp,24(%rsp)
- movl %r12d,%ecx
- xorl 36(%rsp),%r14d
- xorl %edi,%eax
- roll $5,%ecx
- xorl 60(%rsp),%r14d
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%r14d
- xorl 32(%rsp),%edx
- movl %r12d,%eax
- movl %r14d,28(%rsp)
- movl %r11d,%ecx
- xorl 40(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- xorl 0(%rsp),%edx
- leal 1859775393(%r14,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%edx
- xorl 36(%rsp),%ebp
- movl %r11d,%eax
- movl %edx,32(%rsp)
- movl %edi,%ecx
- xorl 44(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 4(%rsp),%ebp
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%ebp
- xorl 40(%rsp),%r14d
- movl %edi,%eax
- movl %ebp,36(%rsp)
- movl %esi,%ecx
- xorl 48(%rsp),%r14d
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 8(%rsp),%r14d
- leal 1859775393(%rbp,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%r14d
- xorl 44(%rsp),%edx
- movl %esi,%eax
- movl %r14d,40(%rsp)
- movl %r13d,%ecx
- xorl 52(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 12(%rsp),%edx
- leal 1859775393(%r14,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%edx
- xorl 48(%rsp),%ebp
- movl %r13d,%eax
- movl %edx,44(%rsp)
- movl %r12d,%ecx
- xorl 56(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- xorl 16(%rsp),%ebp
- leal 1859775393(%rdx,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%ebp
- xorl 52(%rsp),%r14d
- movl %r12d,%eax
- movl %ebp,48(%rsp)
- movl %r11d,%ecx
- xorl 60(%rsp),%r14d
- xorl %esi,%eax
- roll $5,%ecx
- xorl 20(%rsp),%r14d
- leal 1859775393(%rbp,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%r14d
- xorl 56(%rsp),%edx
- movl %r11d,%eax
- movl %r14d,52(%rsp)
- movl %edi,%ecx
- xorl 0(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 24(%rsp),%edx
- leal 1859775393(%r14,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%edx
- xorl 60(%rsp),%ebp
- movl %edi,%eax
- movl %edx,56(%rsp)
- movl %esi,%ecx
- xorl 4(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 28(%rsp),%ebp
- leal 1859775393(%rdx,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%ebp
- xorl 0(%rsp),%r14d
- movl %esi,%eax
- movl %ebp,60(%rsp)
- movl %r13d,%ecx
- xorl 8(%rsp),%r14d
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 32(%rsp),%r14d
- leal 1859775393(%rbp,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%r14d
- xorl 4(%rsp),%edx
- movl %r13d,%eax
- movl %r14d,0(%rsp)
- movl %r12d,%ecx
- xorl 12(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- xorl 36(%rsp),%edx
- leal 1859775393(%r14,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%edx
- xorl 8(%rsp),%ebp
- movl %r12d,%eax
- movl %edx,4(%rsp)
- movl %r11d,%ecx
- xorl 16(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- xorl 40(%rsp),%ebp
- leal 1859775393(%rdx,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%ebp
- xorl 12(%rsp),%r14d
- movl %r11d,%eax
- movl %ebp,8(%rsp)
- movl %edi,%ecx
- xorl 20(%rsp),%r14d
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 44(%rsp),%r14d
- leal 1859775393(%rbp,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%r14d
- xorl 16(%rsp),%edx
- movl %edi,%eax
- movl %r14d,12(%rsp)
- movl %esi,%ecx
- xorl 24(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 48(%rsp),%edx
- leal 1859775393(%r14,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%edx
- xorl 20(%rsp),%ebp
- movl %esi,%eax
- movl %edx,16(%rsp)
- movl %r13d,%ecx
- xorl 28(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 52(%rsp),%ebp
- leal 1859775393(%rdx,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%ebp
- xorl 24(%rsp),%r14d
- movl %r13d,%eax
- movl %ebp,20(%rsp)
- movl %r12d,%ecx
- xorl 32(%rsp),%r14d
- xorl %edi,%eax
- roll $5,%ecx
- xorl 56(%rsp),%r14d
- leal 1859775393(%rbp,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%r14d
- xorl 28(%rsp),%edx
- movl %r12d,%eax
- movl %r14d,24(%rsp)
- movl %r11d,%ecx
- xorl 36(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- xorl 60(%rsp),%edx
- leal 1859775393(%r14,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%edx
- xorl 32(%rsp),%ebp
- movl %r11d,%eax
- movl %edx,28(%rsp)
- movl %edi,%ecx
- xorl 40(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 0(%rsp),%ebp
- leal 1859775393(%rdx,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%ebp
- xorl 36(%rsp),%r14d
- movl %r12d,%eax
- movl %ebp,32(%rsp)
- movl %r12d,%ebx
- xorl 44(%rsp),%r14d
- andl %r11d,%eax
- movl %esi,%ecx
- xorl 4(%rsp),%r14d
- leal -1894007588(%rbp,%r13,1),%r13d
- xorl %r11d,%ebx
- roll $5,%ecx
- addl %eax,%r13d
- roll $1,%r14d
- andl %edi,%ebx
- addl %ecx,%r13d
- roll $30,%edi
- addl %ebx,%r13d
- xorl 40(%rsp),%edx
- movl %r11d,%eax
- movl %r14d,36(%rsp)
- movl %r11d,%ebx
- xorl 48(%rsp),%edx
- andl %edi,%eax
- movl %r13d,%ecx
- xorl 8(%rsp),%edx
- leal -1894007588(%r14,%r12,1),%r12d
- xorl %edi,%ebx
- roll $5,%ecx
- addl %eax,%r12d
- roll $1,%edx
- andl %esi,%ebx
- addl %ecx,%r12d
- roll $30,%esi
- addl %ebx,%r12d
- xorl 44(%rsp),%ebp
- movl %edi,%eax
- movl %edx,40(%rsp)
- movl %edi,%ebx
- xorl 52(%rsp),%ebp
- andl %esi,%eax
- movl %r12d,%ecx
- xorl 12(%rsp),%ebp
- leal -1894007588(%rdx,%r11,1),%r11d
- xorl %esi,%ebx
- roll $5,%ecx
- addl %eax,%r11d
- roll $1,%ebp
- andl %r13d,%ebx
- addl %ecx,%r11d
- roll $30,%r13d
- addl %ebx,%r11d
- xorl 48(%rsp),%r14d
- movl %esi,%eax
- movl %ebp,44(%rsp)
- movl %esi,%ebx
- xorl 56(%rsp),%r14d
- andl %r13d,%eax
- movl %r11d,%ecx
- xorl 16(%rsp),%r14d
- leal -1894007588(%rbp,%rdi,1),%edi
- xorl %r13d,%ebx
- roll $5,%ecx
- addl %eax,%edi
- roll $1,%r14d
- andl %r12d,%ebx
- addl %ecx,%edi
- roll $30,%r12d
- addl %ebx,%edi
- xorl 52(%rsp),%edx
- movl %r13d,%eax
- movl %r14d,48(%rsp)
- movl %r13d,%ebx
- xorl 60(%rsp),%edx
- andl %r12d,%eax
- movl %edi,%ecx
- xorl 20(%rsp),%edx
- leal -1894007588(%r14,%rsi,1),%esi
- xorl %r12d,%ebx
- roll $5,%ecx
- addl %eax,%esi
- roll $1,%edx
- andl %r11d,%ebx
- addl %ecx,%esi
- roll $30,%r11d
- addl %ebx,%esi
- xorl 56(%rsp),%ebp
- movl %r12d,%eax
- movl %edx,52(%rsp)
- movl %r12d,%ebx
- xorl 0(%rsp),%ebp
- andl %r11d,%eax
- movl %esi,%ecx
- xorl 24(%rsp),%ebp
- leal -1894007588(%rdx,%r13,1),%r13d
- xorl %r11d,%ebx
- roll $5,%ecx
- addl %eax,%r13d
- roll $1,%ebp
- andl %edi,%ebx
- addl %ecx,%r13d
- roll $30,%edi
- addl %ebx,%r13d
- xorl 60(%rsp),%r14d
- movl %r11d,%eax
- movl %ebp,56(%rsp)
- movl %r11d,%ebx
- xorl 4(%rsp),%r14d
- andl %edi,%eax
- movl %r13d,%ecx
- xorl 28(%rsp),%r14d
- leal -1894007588(%rbp,%r12,1),%r12d
- xorl %edi,%ebx
- roll $5,%ecx
- addl %eax,%r12d
- roll $1,%r14d
- andl %esi,%ebx
- addl %ecx,%r12d
- roll $30,%esi
- addl %ebx,%r12d
- xorl 0(%rsp),%edx
- movl %edi,%eax
- movl %r14d,60(%rsp)
- movl %edi,%ebx
- xorl 8(%rsp),%edx
- andl %esi,%eax
- movl %r12d,%ecx
- xorl 32(%rsp),%edx
- leal -1894007588(%r14,%r11,1),%r11d
- xorl %esi,%ebx
- roll $5,%ecx
- addl %eax,%r11d
- roll $1,%edx
- andl %r13d,%ebx
- addl %ecx,%r11d
- roll $30,%r13d
- addl %ebx,%r11d
- xorl 4(%rsp),%ebp
- movl %esi,%eax
- movl %edx,0(%rsp)
- movl %esi,%ebx
- xorl 12(%rsp),%ebp
- andl %r13d,%eax
- movl %r11d,%ecx
- xorl 36(%rsp),%ebp
- leal -1894007588(%rdx,%rdi,1),%edi
- xorl %r13d,%ebx
- roll $5,%ecx
- addl %eax,%edi
- roll $1,%ebp
- andl %r12d,%ebx
- addl %ecx,%edi
- roll $30,%r12d
- addl %ebx,%edi
- xorl 8(%rsp),%r14d
- movl %r13d,%eax
- movl %ebp,4(%rsp)
- movl %r13d,%ebx
- xorl 16(%rsp),%r14d
- andl %r12d,%eax
- movl %edi,%ecx
- xorl 40(%rsp),%r14d
- leal -1894007588(%rbp,%rsi,1),%esi
- xorl %r12d,%ebx
- roll $5,%ecx
- addl %eax,%esi
- roll $1,%r14d
- andl %r11d,%ebx
- addl %ecx,%esi
- roll $30,%r11d
- addl %ebx,%esi
- xorl 12(%rsp),%edx
- movl %r12d,%eax
- movl %r14d,8(%rsp)
- movl %r12d,%ebx
- xorl 20(%rsp),%edx
- andl %r11d,%eax
- movl %esi,%ecx
- xorl 44(%rsp),%edx
- leal -1894007588(%r14,%r13,1),%r13d
- xorl %r11d,%ebx
- roll $5,%ecx
- addl %eax,%r13d
- roll $1,%edx
- andl %edi,%ebx
- addl %ecx,%r13d
- roll $30,%edi
- addl %ebx,%r13d
- xorl 16(%rsp),%ebp
- movl %r11d,%eax
- movl %edx,12(%rsp)
- movl %r11d,%ebx
- xorl 24(%rsp),%ebp
- andl %edi,%eax
- movl %r13d,%ecx
- xorl 48(%rsp),%ebp
- leal -1894007588(%rdx,%r12,1),%r12d
- xorl %edi,%ebx
- roll $5,%ecx
- addl %eax,%r12d
- roll $1,%ebp
- andl %esi,%ebx
- addl %ecx,%r12d
- roll $30,%esi
- addl %ebx,%r12d
- xorl 20(%rsp),%r14d
- movl %edi,%eax
- movl %ebp,16(%rsp)
- movl %edi,%ebx
- xorl 28(%rsp),%r14d
- andl %esi,%eax
- movl %r12d,%ecx
- xorl 52(%rsp),%r14d
- leal -1894007588(%rbp,%r11,1),%r11d
- xorl %esi,%ebx
- roll $5,%ecx
- addl %eax,%r11d
- roll $1,%r14d
- andl %r13d,%ebx
- addl %ecx,%r11d
- roll $30,%r13d
- addl %ebx,%r11d
- xorl 24(%rsp),%edx
- movl %esi,%eax
- movl %r14d,20(%rsp)
- movl %esi,%ebx
- xorl 32(%rsp),%edx
- andl %r13d,%eax
- movl %r11d,%ecx
- xorl 56(%rsp),%edx
- leal -1894007588(%r14,%rdi,1),%edi
- xorl %r13d,%ebx
- roll $5,%ecx
- addl %eax,%edi
- roll $1,%edx
- andl %r12d,%ebx
- addl %ecx,%edi
- roll $30,%r12d
- addl %ebx,%edi
- xorl 28(%rsp),%ebp
- movl %r13d,%eax
- movl %edx,24(%rsp)
- movl %r13d,%ebx
- xorl 36(%rsp),%ebp
- andl %r12d,%eax
- movl %edi,%ecx
- xorl 60(%rsp),%ebp
- leal -1894007588(%rdx,%rsi,1),%esi
- xorl %r12d,%ebx
- roll $5,%ecx
- addl %eax,%esi
- roll $1,%ebp
- andl %r11d,%ebx
- addl %ecx,%esi
- roll $30,%r11d
- addl %ebx,%esi
- xorl 32(%rsp),%r14d
- movl %r12d,%eax
- movl %ebp,28(%rsp)
- movl %r12d,%ebx
- xorl 40(%rsp),%r14d
- andl %r11d,%eax
- movl %esi,%ecx
- xorl 0(%rsp),%r14d
- leal -1894007588(%rbp,%r13,1),%r13d
- xorl %r11d,%ebx
- roll $5,%ecx
- addl %eax,%r13d
- roll $1,%r14d
- andl %edi,%ebx
- addl %ecx,%r13d
- roll $30,%edi
- addl %ebx,%r13d
- xorl 36(%rsp),%edx
- movl %r11d,%eax
- movl %r14d,32(%rsp)
- movl %r11d,%ebx
- xorl 44(%rsp),%edx
- andl %edi,%eax
- movl %r13d,%ecx
- xorl 4(%rsp),%edx
- leal -1894007588(%r14,%r12,1),%r12d
- xorl %edi,%ebx
- roll $5,%ecx
- addl %eax,%r12d
- roll $1,%edx
- andl %esi,%ebx
- addl %ecx,%r12d
- roll $30,%esi
- addl %ebx,%r12d
- xorl 40(%rsp),%ebp
- movl %edi,%eax
- movl %edx,36(%rsp)
- movl %edi,%ebx
- xorl 48(%rsp),%ebp
- andl %esi,%eax
- movl %r12d,%ecx
- xorl 8(%rsp),%ebp
- leal -1894007588(%rdx,%r11,1),%r11d
- xorl %esi,%ebx
- roll $5,%ecx
- addl %eax,%r11d
- roll $1,%ebp
- andl %r13d,%ebx
- addl %ecx,%r11d
- roll $30,%r13d
- addl %ebx,%r11d
- xorl 44(%rsp),%r14d
- movl %esi,%eax
- movl %ebp,40(%rsp)
- movl %esi,%ebx
- xorl 52(%rsp),%r14d
- andl %r13d,%eax
- movl %r11d,%ecx
- xorl 12(%rsp),%r14d
- leal -1894007588(%rbp,%rdi,1),%edi
- xorl %r13d,%ebx
- roll $5,%ecx
- addl %eax,%edi
- roll $1,%r14d
- andl %r12d,%ebx
- addl %ecx,%edi
- roll $30,%r12d
- addl %ebx,%edi
- xorl 48(%rsp),%edx
- movl %r13d,%eax
- movl %r14d,44(%rsp)
- movl %r13d,%ebx
- xorl 56(%rsp),%edx
- andl %r12d,%eax
- movl %edi,%ecx
- xorl 16(%rsp),%edx
- leal -1894007588(%r14,%rsi,1),%esi
- xorl %r12d,%ebx
- roll $5,%ecx
- addl %eax,%esi
- roll $1,%edx
- andl %r11d,%ebx
- addl %ecx,%esi
- roll $30,%r11d
- addl %ebx,%esi
- xorl 52(%rsp),%ebp
- movl %edi,%eax
- movl %edx,48(%rsp)
- movl %esi,%ecx
- xorl 60(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 20(%rsp),%ebp
- leal -899497514(%rdx,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%ebp
- xorl 56(%rsp),%r14d
- movl %esi,%eax
- movl %ebp,52(%rsp)
- movl %r13d,%ecx
- xorl 0(%rsp),%r14d
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 24(%rsp),%r14d
- leal -899497514(%rbp,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%r14d
- xorl 60(%rsp),%edx
- movl %r13d,%eax
- movl %r14d,56(%rsp)
- movl %r12d,%ecx
- xorl 4(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- xorl 28(%rsp),%edx
- leal -899497514(%r14,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%edx
- xorl 0(%rsp),%ebp
- movl %r12d,%eax
- movl %edx,60(%rsp)
- movl %r11d,%ecx
- xorl 8(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- xorl 32(%rsp),%ebp
- leal -899497514(%rdx,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%ebp
- xorl 4(%rsp),%r14d
- movl %r11d,%eax
- movl %ebp,0(%rsp)
- movl %edi,%ecx
- xorl 12(%rsp),%r14d
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 36(%rsp),%r14d
- leal -899497514(%rbp,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%r14d
- xorl 8(%rsp),%edx
- movl %edi,%eax
- movl %r14d,4(%rsp)
- movl %esi,%ecx
- xorl 16(%rsp),%edx
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 40(%rsp),%edx
- leal -899497514(%r14,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%edx
- xorl 12(%rsp),%ebp
- movl %esi,%eax
- movl %edx,8(%rsp)
- movl %r13d,%ecx
- xorl 20(%rsp),%ebp
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 44(%rsp),%ebp
- leal -899497514(%rdx,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%ebp
- xorl 16(%rsp),%r14d
- movl %r13d,%eax
- movl %ebp,12(%rsp)
- movl %r12d,%ecx
- xorl 24(%rsp),%r14d
- xorl %edi,%eax
- roll $5,%ecx
- xorl 48(%rsp),%r14d
- leal -899497514(%rbp,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%r14d
- xorl 20(%rsp),%edx
- movl %r12d,%eax
- movl %r14d,16(%rsp)
- movl %r11d,%ecx
- xorl 28(%rsp),%edx
- xorl %esi,%eax
- roll $5,%ecx
- xorl 52(%rsp),%edx
- leal -899497514(%r14,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%edx
- xorl 24(%rsp),%ebp
- movl %r11d,%eax
- movl %edx,20(%rsp)
- movl %edi,%ecx
- xorl 32(%rsp),%ebp
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 56(%rsp),%ebp
- leal -899497514(%rdx,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%ebp
- xorl 28(%rsp),%r14d
- movl %edi,%eax
- movl %ebp,24(%rsp)
- movl %esi,%ecx
- xorl 36(%rsp),%r14d
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 60(%rsp),%r14d
- leal -899497514(%rbp,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%r14d
- xorl 32(%rsp),%edx
- movl %esi,%eax
- movl %r14d,28(%rsp)
- movl %r13d,%ecx
- xorl 40(%rsp),%edx
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 0(%rsp),%edx
- leal -899497514(%r14,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%edx
- xorl 36(%rsp),%ebp
- movl %r13d,%eax
-
- movl %r12d,%ecx
- xorl 44(%rsp),%ebp
- xorl %edi,%eax
- roll $5,%ecx
- xorl 4(%rsp),%ebp
- leal -899497514(%rdx,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%ebp
- xorl 40(%rsp),%r14d
- movl %r12d,%eax
-
- movl %r11d,%ecx
- xorl 48(%rsp),%r14d
- xorl %esi,%eax
- roll $5,%ecx
- xorl 8(%rsp),%r14d
- leal -899497514(%rbp,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%r14d
- xorl 44(%rsp),%edx
- movl %r11d,%eax
-
- movl %edi,%ecx
- xorl 52(%rsp),%edx
- xorl %r13d,%eax
- roll $5,%ecx
- xorl 12(%rsp),%edx
- leal -899497514(%r14,%rsi,1),%esi
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- roll $1,%edx
- xorl 48(%rsp),%ebp
- movl %edi,%eax
-
- movl %esi,%ecx
- xorl 56(%rsp),%ebp
- xorl %r12d,%eax
- roll $5,%ecx
- xorl 16(%rsp),%ebp
- leal -899497514(%rdx,%r13,1),%r13d
- xorl %r11d,%eax
- addl %ecx,%r13d
- roll $30,%edi
- addl %eax,%r13d
- roll $1,%ebp
- xorl 52(%rsp),%r14d
- movl %esi,%eax
-
- movl %r13d,%ecx
- xorl 60(%rsp),%r14d
- xorl %r11d,%eax
- roll $5,%ecx
- xorl 20(%rsp),%r14d
- leal -899497514(%rbp,%r12,1),%r12d
- xorl %edi,%eax
- addl %ecx,%r12d
- roll $30,%esi
- addl %eax,%r12d
- roll $1,%r14d
- xorl 56(%rsp),%edx
- movl %r13d,%eax
-
- movl %r12d,%ecx
- xorl 0(%rsp),%edx
- xorl %edi,%eax
- roll $5,%ecx
- xorl 24(%rsp),%edx
- leal -899497514(%r14,%r11,1),%r11d
- xorl %esi,%eax
- addl %ecx,%r11d
- roll $30,%r13d
- addl %eax,%r11d
- roll $1,%edx
- xorl 60(%rsp),%ebp
- movl %r12d,%eax
-
- movl %r11d,%ecx
- xorl 4(%rsp),%ebp
- xorl %esi,%eax
- roll $5,%ecx
- xorl 28(%rsp),%ebp
- leal -899497514(%rdx,%rdi,1),%edi
- xorl %r13d,%eax
- addl %ecx,%edi
- roll $30,%r12d
- addl %eax,%edi
- roll $1,%ebp
- movl %r11d,%eax
- movl %edi,%ecx
- xorl %r13d,%eax
- leal -899497514(%rbp,%rsi,1),%esi
- roll $5,%ecx
- xorl %r12d,%eax
- addl %ecx,%esi
- roll $30,%r11d
- addl %eax,%esi
- addl 0(%r8),%esi
- addl 4(%r8),%edi
- addl 8(%r8),%r11d
- addl 12(%r8),%r12d
- addl 16(%r8),%r13d
- movl %esi,0(%r8)
- movl %edi,4(%r8)
- movl %r11d,8(%r8)
- movl %r12d,12(%r8)
- movl %r13d,16(%r8)
-
- subq $1,%r10
- leaq 64(%r9),%r9
- jnz L$loop
-
- movq 64(%rsp),%rsi
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue:
- ret
-
-
-
-.p2align 5
-sha1_block_data_order_shaext:
-_shaext_shortcut:
-
- movdqu (%rdi),%xmm0
- movd 16(%rdi),%xmm1
- movdqa K_XX_XX+160(%rip),%xmm3
-
- movdqu (%rsi),%xmm4
- pshufd $27,%xmm0,%xmm0
- movdqu 16(%rsi),%xmm5
- pshufd $27,%xmm1,%xmm1
- movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,227
- movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,235
-.byte 102,15,56,0,243
- movdqa %xmm1,%xmm9
-.byte 102,15,56,0,251
- jmp L$oop_shaext
-
-.p2align 4
-L$oop_shaext:
- decq %rdx
- leaq 64(%rsi),%r8
- paddd %xmm4,%xmm1
- cmovneq %r8,%rsi
- prefetcht0 512(%rsi)
- movdqa %xmm0,%xmm8
-.byte 15,56,201,229
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
- pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,206
- pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,215
- pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,0
-.byte 15,56,200,204
- pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,0
-.byte 15,56,200,213
- pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
- pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,215
- pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,204
- pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,1
-.byte 15,56,200,213
- pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,1
-.byte 15,56,200,206
- pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
- pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,204
- pxor %xmm5,%xmm7
-.byte 15,56,202,254
-.byte 15,56,201,229
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,213
- pxor %xmm6,%xmm4
-.byte 15,56,201,238
-.byte 15,56,202,231
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,2
-.byte 15,56,200,206
- pxor %xmm7,%xmm5
-.byte 15,56,202,236
-.byte 15,56,201,247
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,2
-.byte 15,56,200,215
- pxor %xmm4,%xmm6
-.byte 15,56,201,252
-.byte 15,56,202,245
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,204
- pxor %xmm5,%xmm7
-.byte 15,56,202,254
- movdqu (%rsi),%xmm4
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,213
- movdqu 16(%rsi),%xmm5
-.byte 102,15,56,0,227
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 15,56,200,206
- movdqu 32(%rsi),%xmm6
-.byte 102,15,56,0,235
-
- movdqa %xmm0,%xmm2
-.byte 15,58,204,193,3
-.byte 15,56,200,215
- movdqu 48(%rsi),%xmm7
-.byte 102,15,56,0,243
-
- movdqa %xmm0,%xmm1
-.byte 15,58,204,194,3
-.byte 65,15,56,200,201
-.byte 102,15,56,0,251
-
- paddd %xmm8,%xmm0
- movdqa %xmm1,%xmm9
-
- jnz L$oop_shaext
-
- pshufd $27,%xmm0,%xmm0
- pshufd $27,%xmm1,%xmm1
- movdqu %xmm0,(%rdi)
- movd %xmm1,16(%rdi)
- ret
-
-
-
-.p2align 4
-sha1_block_data_order_ssse3:
-_ssse3_shortcut:
-
- movq %rsp,%r11
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- leaq -64(%rsp),%rsp
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- movdqa 64(%r14),%xmm6
- movdqa -64(%r14),%xmm9
- movdqu 0(%r9),%xmm0
- movdqu 16(%r9),%xmm1
- movdqu 32(%r9),%xmm2
- movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
-.byte 102,15,56,0,206
-.byte 102,15,56,0,214
- addq $64,%r9
- paddd %xmm9,%xmm0
-.byte 102,15,56,0,222
- paddd %xmm9,%xmm1
- paddd %xmm9,%xmm2
- movdqa %xmm0,0(%rsp)
- psubd %xmm9,%xmm0
- movdqa %xmm1,16(%rsp)
- psubd %xmm9,%xmm1
- movdqa %xmm2,32(%rsp)
- psubd %xmm9,%xmm2
- jmp L$oop_ssse3
-.p2align 4
-L$oop_ssse3:
- rorl $2,%ebx
- pshufd $238,%xmm0,%xmm4
- xorl %edx,%esi
- movdqa %xmm3,%xmm8
- paddd %xmm3,%xmm9
- movl %eax,%edi
- addl 0(%rsp),%ebp
- punpcklqdq %xmm1,%xmm4
- xorl %ecx,%ebx
- roll $5,%eax
- addl %esi,%ebp
- psrldq $4,%xmm8
- andl %ebx,%edi
- xorl %ecx,%ebx
- pxor %xmm0,%xmm4
- addl %eax,%ebp
- rorl $7,%eax
- pxor %xmm2,%xmm8
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- pxor %xmm8,%xmm4
- xorl %ebx,%eax
- roll $5,%ebp
- movdqa %xmm9,48(%rsp)
- addl %edi,%edx
- andl %eax,%esi
- movdqa %xmm4,%xmm10
- xorl %ebx,%eax
- addl %ebp,%edx
- rorl $7,%ebp
- movdqa %xmm4,%xmm8
- xorl %ebx,%esi
- pslldq $12,%xmm10
- paddd %xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- psrld $31,%xmm8
- xorl %eax,%ebp
- roll $5,%edx
- addl %esi,%ecx
- movdqa %xmm10,%xmm9
- andl %ebp,%edi
- xorl %eax,%ebp
- psrld $30,%xmm10
- addl %edx,%ecx
- rorl $7,%edx
- por %xmm8,%xmm4
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- pslld $2,%xmm9
- pxor %xmm10,%xmm4
- xorl %ebp,%edx
- movdqa -64(%r14),%xmm10
- roll $5,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- pxor %xmm9,%xmm4
- xorl %ebp,%edx
- addl %ecx,%ebx
- rorl $7,%ecx
- pshufd $238,%xmm1,%xmm5
- xorl %ebp,%esi
- movdqa %xmm4,%xmm9
- paddd %xmm4,%xmm10
- movl %ebx,%edi
- addl 16(%rsp),%eax
- punpcklqdq %xmm2,%xmm5
- xorl %edx,%ecx
- roll $5,%ebx
- addl %esi,%eax
- psrldq $4,%xmm9
- andl %ecx,%edi
- xorl %edx,%ecx
- pxor %xmm1,%xmm5
- addl %ebx,%eax
- rorl $7,%ebx
- pxor %xmm3,%xmm9
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- pxor %xmm9,%xmm5
- xorl %ecx,%ebx
- roll $5,%eax
- movdqa %xmm10,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- movdqa %xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- rorl $7,%eax
- movdqa %xmm5,%xmm9
- xorl %ecx,%esi
- pslldq $12,%xmm8
- paddd %xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- psrld $31,%xmm9
- xorl %ebx,%eax
- roll $5,%ebp
- addl %esi,%edx
- movdqa %xmm8,%xmm10
- andl %eax,%edi
- xorl %ebx,%eax
- psrld $30,%xmm8
- addl %ebp,%edx
- rorl $7,%ebp
- por %xmm9,%xmm5
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- pslld $2,%xmm10
- pxor %xmm8,%xmm5
- xorl %eax,%ebp
- movdqa -32(%r14),%xmm8
- roll $5,%edx
- addl %edi,%ecx
- andl %ebp,%esi
- pxor %xmm10,%xmm5
- xorl %eax,%ebp
- addl %edx,%ecx
- rorl $7,%edx
- pshufd $238,%xmm2,%xmm6
- xorl %eax,%esi
- movdqa %xmm5,%xmm10
- paddd %xmm5,%xmm8
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- punpcklqdq %xmm3,%xmm6
- xorl %ebp,%edx
- roll $5,%ecx
- addl %esi,%ebx
- psrldq $4,%xmm10
- andl %edx,%edi
- xorl %ebp,%edx
- pxor %xmm2,%xmm6
- addl %ecx,%ebx
- rorl $7,%ecx
- pxor %xmm4,%xmm10
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- pxor %xmm10,%xmm6
- xorl %edx,%ecx
- roll $5,%ebx
- movdqa %xmm8,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- movdqa %xmm6,%xmm9
- xorl %edx,%ecx
- addl %ebx,%eax
- rorl $7,%ebx
- movdqa %xmm6,%xmm10
- xorl %edx,%esi
- pslldq $12,%xmm9
- paddd %xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- psrld $31,%xmm10
- xorl %ecx,%ebx
- roll $5,%eax
- addl %esi,%ebp
- movdqa %xmm9,%xmm8
- andl %ebx,%edi
- xorl %ecx,%ebx
- psrld $30,%xmm9
- addl %eax,%ebp
- rorl $7,%eax
- por %xmm10,%xmm6
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- pslld $2,%xmm8
- pxor %xmm9,%xmm6
- xorl %ebx,%eax
- movdqa -32(%r14),%xmm9
- roll $5,%ebp
- addl %edi,%edx
- andl %eax,%esi
- pxor %xmm8,%xmm6
- xorl %ebx,%eax
- addl %ebp,%edx
- rorl $7,%ebp
- pshufd $238,%xmm3,%xmm7
- xorl %ebx,%esi
- movdqa %xmm6,%xmm8
- paddd %xmm6,%xmm9
- movl %edx,%edi
- addl 48(%rsp),%ecx
- punpcklqdq %xmm4,%xmm7
- xorl %eax,%ebp
- roll $5,%edx
- addl %esi,%ecx
- psrldq $4,%xmm8
- andl %ebp,%edi
- xorl %eax,%ebp
- pxor %xmm3,%xmm7
- addl %edx,%ecx
- rorl $7,%edx
- pxor %xmm5,%xmm8
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- pxor %xmm8,%xmm7
- xorl %ebp,%edx
- roll $5,%ecx
- movdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- movdqa %xmm7,%xmm10
- xorl %ebp,%edx
- addl %ecx,%ebx
- rorl $7,%ecx
- movdqa %xmm7,%xmm8
- xorl %ebp,%esi
- pslldq $12,%xmm10
- paddd %xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- psrld $31,%xmm8
- xorl %edx,%ecx
- roll $5,%ebx
- addl %esi,%eax
- movdqa %xmm10,%xmm9
- andl %ecx,%edi
- xorl %edx,%ecx
- psrld $30,%xmm10
- addl %ebx,%eax
- rorl $7,%ebx
- por %xmm8,%xmm7
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- pslld $2,%xmm9
- pxor %xmm10,%xmm7
- xorl %ecx,%ebx
- movdqa -32(%r14),%xmm10
- roll $5,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- pxor %xmm9,%xmm7
- pshufd $238,%xmm6,%xmm9
- xorl %ecx,%ebx
- addl %eax,%ebp
- rorl $7,%eax
- pxor %xmm4,%xmm0
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- punpcklqdq %xmm7,%xmm9
- xorl %ebx,%eax
- roll $5,%ebp
- pxor %xmm1,%xmm0
- addl %esi,%edx
- andl %eax,%edi
- movdqa %xmm10,%xmm8
- xorl %ebx,%eax
- paddd %xmm7,%xmm10
- addl %ebp,%edx
- pxor %xmm9,%xmm0
- rorl $7,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 4(%rsp),%ecx
- movdqa %xmm0,%xmm9
- xorl %eax,%ebp
- roll $5,%edx
- movdqa %xmm10,48(%rsp)
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- pslld $2,%xmm0
- addl %edx,%ecx
- rorl $7,%edx
- psrld $30,%xmm9
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- por %xmm9,%xmm0
- xorl %ebp,%edx
- roll $5,%ecx
- pshufd $238,%xmm7,%xmm10
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- roll $5,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- addl %ebx,%eax
- pxor %xmm5,%xmm1
- addl 16(%rsp),%ebp
- xorl %ecx,%esi
- punpcklqdq %xmm0,%xmm10
- movl %eax,%edi
- roll $5,%eax
- pxor %xmm2,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- movdqa %xmm8,%xmm9
- rorl $7,%ebx
- paddd %xmm0,%xmm8
- addl %eax,%ebp
- pxor %xmm10,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- movdqa %xmm1,%xmm10
- addl %edi,%edx
- xorl %ebx,%esi
- movdqa %xmm8,0(%rsp)
- rorl $7,%eax
- addl %ebp,%edx
- addl 24(%rsp),%ecx
- pslld $2,%xmm1
- xorl %eax,%esi
- movl %edx,%edi
- psrld $30,%xmm10
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- rorl $7,%ebp
- por %xmm10,%xmm1
- addl %edx,%ecx
- addl 28(%rsp),%ebx
- pshufd $238,%xmm0,%xmm8
- xorl %ebp,%edi
- movl %ecx,%esi
- roll $5,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- pxor %xmm6,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- punpcklqdq %xmm1,%xmm8
- movl %ebx,%edi
- roll $5,%ebx
- pxor %xmm3,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- movdqa 0(%r14),%xmm10
- rorl $7,%ecx
- paddd %xmm1,%xmm9
- addl %ebx,%eax
- pxor %xmm8,%xmm2
- addl 36(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- roll $5,%eax
- movdqa %xmm2,%xmm8
- addl %edi,%ebp
- xorl %ecx,%esi
- movdqa %xmm9,16(%rsp)
- rorl $7,%ebx
- addl %eax,%ebp
- addl 40(%rsp),%edx
- pslld $2,%xmm2
- xorl %ebx,%esi
- movl %ebp,%edi
- psrld $30,%xmm8
- roll $5,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- rorl $7,%eax
- por %xmm8,%xmm2
- addl %ebp,%edx
- addl 44(%rsp),%ecx
- pshufd $238,%xmm1,%xmm9
- xorl %eax,%edi
- movl %edx,%esi
- roll $5,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- rorl $7,%ebp
- addl %edx,%ecx
- pxor %xmm7,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- punpcklqdq %xmm2,%xmm9
- movl %ecx,%edi
- roll $5,%ecx
- pxor %xmm4,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- movdqa %xmm10,%xmm8
- rorl $7,%edx
- paddd %xmm2,%xmm10
- addl %ecx,%ebx
- pxor %xmm9,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- roll $5,%ebx
- movdqa %xmm3,%xmm9
- addl %edi,%eax
- xorl %edx,%esi
- movdqa %xmm10,32(%rsp)
- rorl $7,%ecx
- addl %ebx,%eax
- addl 56(%rsp),%ebp
- pslld $2,%xmm3
- xorl %ecx,%esi
- movl %eax,%edi
- psrld $30,%xmm9
- roll $5,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- rorl $7,%ebx
- por %xmm9,%xmm3
- addl %eax,%ebp
- addl 60(%rsp),%edx
- pshufd $238,%xmm2,%xmm10
- xorl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %ebp,%edx
- pxor %xmm0,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- punpcklqdq %xmm3,%xmm10
- movl %edx,%edi
- roll $5,%edx
- pxor %xmm5,%xmm4
- addl %esi,%ecx
- xorl %eax,%edi
- movdqa %xmm8,%xmm9
- rorl $7,%ebp
- paddd %xmm3,%xmm8
- addl %edx,%ecx
- pxor %xmm10,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- roll $5,%ecx
- movdqa %xmm4,%xmm10
- addl %edi,%ebx
- xorl %ebp,%esi
- movdqa %xmm8,48(%rsp)
- rorl $7,%edx
- addl %ecx,%ebx
- addl 8(%rsp),%eax
- pslld $2,%xmm4
- xorl %edx,%esi
- movl %ebx,%edi
- psrld $30,%xmm10
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- rorl $7,%ecx
- por %xmm10,%xmm4
- addl %ebx,%eax
- addl 12(%rsp),%ebp
- pshufd $238,%xmm3,%xmm8
- xorl %ecx,%edi
- movl %eax,%esi
- roll $5,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%ebp
- pxor %xmm1,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- punpcklqdq %xmm4,%xmm8
- movl %ebp,%edi
- roll $5,%ebp
- pxor %xmm6,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- movdqa %xmm9,%xmm10
- rorl $7,%eax
- paddd %xmm4,%xmm9
- addl %ebp,%edx
- pxor %xmm8,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- roll $5,%edx
- movdqa %xmm5,%xmm8
- addl %edi,%ecx
- xorl %eax,%esi
- movdqa %xmm9,0(%rsp)
- rorl $7,%ebp
- addl %edx,%ecx
- addl 24(%rsp),%ebx
- pslld $2,%xmm5
- xorl %ebp,%esi
- movl %ecx,%edi
- psrld $30,%xmm8
- roll $5,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- rorl $7,%edx
- por %xmm8,%xmm5
- addl %ecx,%ebx
- addl 28(%rsp),%eax
- pshufd $238,%xmm4,%xmm9
- rorl $7,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- roll $5,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- pxor %xmm2,%xmm6
- addl 32(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- rorl $7,%ebx
- punpcklqdq %xmm5,%xmm9
- movl %eax,%edi
- xorl %ecx,%esi
- pxor %xmm7,%xmm6
- roll $5,%eax
- addl %esi,%ebp
- movdqa %xmm10,%xmm8
- xorl %ebx,%edi
- paddd %xmm5,%xmm10
- xorl %ecx,%ebx
- pxor %xmm9,%xmm6
- addl %eax,%ebp
- addl 36(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- rorl $7,%eax
- movdqa %xmm6,%xmm9
- movl %ebp,%esi
- xorl %ebx,%edi
- movdqa %xmm10,16(%rsp)
- roll $5,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- pslld $2,%xmm6
- xorl %ebx,%eax
- addl %ebp,%edx
- psrld $30,%xmm9
- addl 40(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- por %xmm9,%xmm6
- rorl $7,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- roll $5,%edx
- pshufd $238,%xmm5,%xmm10
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- rorl $7,%edx
- movl %ecx,%esi
- xorl %ebp,%edi
- roll $5,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- pxor %xmm3,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- rorl $7,%ecx
- punpcklqdq %xmm6,%xmm10
- movl %ebx,%edi
- xorl %edx,%esi
- pxor %xmm0,%xmm7
- roll $5,%ebx
- addl %esi,%eax
- movdqa 32(%r14),%xmm9
- xorl %ecx,%edi
- paddd %xmm6,%xmm8
- xorl %edx,%ecx
- pxor %xmm10,%xmm7
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- rorl $7,%ebx
- movdqa %xmm7,%xmm10
- movl %eax,%esi
- xorl %ecx,%edi
- movdqa %xmm8,32(%rsp)
- roll $5,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- pslld $2,%xmm7
- xorl %ecx,%ebx
- addl %eax,%ebp
- psrld $30,%xmm10
- addl 56(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- por %xmm10,%xmm7
- rorl $7,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- roll $5,%ebp
- pshufd $238,%xmm6,%xmm8
- addl %esi,%edx
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- rorl $7,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- roll $5,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- pxor %xmm4,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- rorl $7,%edx
- punpcklqdq %xmm7,%xmm8
- movl %ecx,%edi
- xorl %ebp,%esi
- pxor %xmm1,%xmm0
- roll $5,%ecx
- addl %esi,%ebx
- movdqa %xmm9,%xmm10
- xorl %edx,%edi
- paddd %xmm7,%xmm9
- xorl %ebp,%edx
- pxor %xmm8,%xmm0
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- rorl $7,%ecx
- movdqa %xmm0,%xmm8
- movl %ebx,%esi
- xorl %edx,%edi
- movdqa %xmm9,48(%rsp)
- roll $5,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- pslld $2,%xmm0
- xorl %edx,%ecx
- addl %ebx,%eax
- psrld $30,%xmm8
- addl 8(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- por %xmm8,%xmm0
- rorl $7,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- roll $5,%eax
- pshufd $238,%xmm7,%xmm9
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- rorl $7,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- roll $5,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- pxor %xmm5,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- rorl $7,%ebp
- punpcklqdq %xmm0,%xmm9
- movl %edx,%edi
- xorl %eax,%esi
- pxor %xmm2,%xmm1
- roll $5,%edx
- addl %esi,%ecx
- movdqa %xmm10,%xmm8
- xorl %ebp,%edi
- paddd %xmm0,%xmm10
- xorl %eax,%ebp
- pxor %xmm9,%xmm1
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- rorl $7,%edx
- movdqa %xmm1,%xmm9
- movl %ecx,%esi
- xorl %ebp,%edi
- movdqa %xmm10,0(%rsp)
- roll $5,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- pslld $2,%xmm1
- xorl %ebp,%edx
- addl %ecx,%ebx
- psrld $30,%xmm9
- addl 24(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- por %xmm9,%xmm1
- rorl $7,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- roll $5,%ebx
- pshufd $238,%xmm0,%xmm10
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- rorl $7,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- roll $5,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- pxor %xmm6,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- rorl $7,%eax
- punpcklqdq %xmm1,%xmm10
- movl %ebp,%edi
- xorl %ebx,%esi
- pxor %xmm3,%xmm2
- roll $5,%ebp
- addl %esi,%edx
- movdqa %xmm8,%xmm9
- xorl %eax,%edi
- paddd %xmm1,%xmm8
- xorl %ebx,%eax
- pxor %xmm10,%xmm2
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- rorl $7,%ebp
- movdqa %xmm2,%xmm10
- movl %edx,%esi
- xorl %eax,%edi
- movdqa %xmm8,16(%rsp)
- roll $5,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- pslld $2,%xmm2
- xorl %eax,%ebp
- addl %edx,%ecx
- psrld $30,%xmm10
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- por %xmm10,%xmm2
- rorl $7,%edx
- movl %ecx,%edi
- xorl %ebp,%esi
- roll $5,%ecx
- pshufd $238,%xmm1,%xmm8
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- rorl $7,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- roll $5,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- pxor %xmm7,%xmm3
- addl 48(%rsp),%ebp
- xorl %ecx,%esi
- punpcklqdq %xmm2,%xmm8
- movl %eax,%edi
- roll $5,%eax
- pxor %xmm4,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- movdqa %xmm9,%xmm10
- rorl $7,%ebx
- paddd %xmm2,%xmm9
- addl %eax,%ebp
- pxor %xmm8,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- movdqa %xmm3,%xmm8
- addl %edi,%edx
- xorl %ebx,%esi
- movdqa %xmm9,32(%rsp)
- rorl $7,%eax
- addl %ebp,%edx
- addl 56(%rsp),%ecx
- pslld $2,%xmm3
- xorl %eax,%esi
- movl %edx,%edi
- psrld $30,%xmm8
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- rorl $7,%ebp
- por %xmm8,%xmm3
- addl %edx,%ecx
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- roll $5,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- roll $5,%ebx
- paddd %xmm3,%xmm10
- addl %esi,%eax
- xorl %edx,%edi
- movdqa %xmm10,48(%rsp)
- rorl $7,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- roll $5,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- roll $5,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- rorl $7,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- roll $5,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- rorl $7,%ebp
- addl %edx,%ecx
- cmpq %r10,%r9
- je L$done_ssse3
- movdqa 64(%r14),%xmm6
- movdqa -64(%r14),%xmm9
- movdqu 0(%r9),%xmm0
- movdqu 16(%r9),%xmm1
- movdqu 32(%r9),%xmm2
- movdqu 48(%r9),%xmm3
-.byte 102,15,56,0,198
- addq $64,%r9
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
-.byte 102,15,56,0,206
- roll $5,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- rorl $7,%edx
- paddd %xmm9,%xmm0
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- movdqa %xmm0,0(%rsp)
- roll $5,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- psubd %xmm9,%xmm0
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- roll $5,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- rorl $7,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
-.byte 102,15,56,0,214
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- rorl $7,%ebp
- paddd %xmm9,%xmm1
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- movdqa %xmm1,16(%rsp)
- roll $5,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- rorl $7,%edx
- psubd %xmm9,%xmm1
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- rorl $7,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- roll $5,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
-.byte 102,15,56,0,222
- roll $5,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- rorl $7,%eax
- paddd %xmm9,%xmm2
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- movdqa %xmm2,32(%rsp)
- roll $5,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- rorl $7,%ebp
- psubd %xmm9,%xmm2
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- roll $5,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- roll $5,%ebx
- addl %edi,%eax
- rorl $7,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- addl 12(%r8),%edx
- movl %eax,0(%r8)
- addl 16(%r8),%ebp
- movl %esi,4(%r8)
- movl %esi,%ebx
- movl %ecx,8(%r8)
- movl %ecx,%edi
- movl %edx,12(%r8)
- xorl %edx,%edi
- movl %ebp,16(%r8)
- andl %edi,%esi
- jmp L$oop_ssse3
-
-.p2align 4
-L$done_ssse3:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- roll $5,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- roll $5,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- rorl $7,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- roll $5,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- rorl $7,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- roll $5,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- rorl $7,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- roll $5,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- rorl $7,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- roll $5,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- roll $5,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- rorl $7,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- roll $5,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- rorl $7,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- roll $5,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- rorl $7,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- roll $5,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- rorl $7,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- roll $5,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- rorl $7,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- roll $5,%ebx
- addl %edi,%eax
- rorl $7,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- movl %eax,0(%r8)
- addl 12(%r8),%edx
- movl %esi,4(%r8)
- addl 16(%r8),%ebp
- movl %ecx,8(%r8)
- movl %edx,12(%r8)
- movl %ebp,16(%r8)
- movq -40(%r11),%r14
-
- movq -32(%r11),%r13
-
- movq -24(%r11),%r12
-
- movq -16(%r11),%rbp
-
- movq -8(%r11),%rbx
-
- leaq (%r11),%rsp
-
-L$epilogue_ssse3:
- ret
-
-
-
-.p2align 4
-sha1_block_data_order_avx:
-_avx_shortcut:
-
- movq %rsp,%r11
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- leaq -64(%rsp),%rsp
- vzeroupper
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r14),%xmm6
- vmovdqa -64(%r14),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm11,%xmm0,%xmm4
- vpaddd %xmm11,%xmm1,%xmm5
- vpaddd %xmm11,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- jmp L$oop_avx
-.p2align 4
-L$oop_avx:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm10
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm4,%xmm4
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vpxor %xmm10,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm11,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm10
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm10,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa -32(%r14),%xmm11
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vpaddd %xmm5,%xmm11,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm10
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm6,%xmm6
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm10,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm11,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm10
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm10,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm11,%xmm9
- addl %esi,%edx
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r14),%xmm11
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- xorl %eax,%edi
- vpaddd %xmm3,%xmm11,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm11,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm11,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r14),%xmm11
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm11,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm11,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm11,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r10,%r9
- je L$done_avx
- vmovdqa 64(%r14),%xmm6
- vmovdqa -64(%r14),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm11,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm11,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm5,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm11,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm6,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- addl 12(%r8),%edx
- movl %eax,0(%r8)
- addl 16(%r8),%ebp
- movl %esi,4(%r8)
- movl %esi,%ebx
- movl %ecx,8(%r8)
- movl %ecx,%edi
- movl %edx,12(%r8)
- xorl %edx,%edi
- movl %ebp,16(%r8)
- andl %edi,%esi
- jmp L$oop_avx
-
-.p2align 4
-L$done_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroupper
-
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- movl %eax,0(%r8)
- addl 12(%r8),%edx
- movl %esi,4(%r8)
- addl 16(%r8),%ebp
- movl %ecx,8(%r8)
- movl %edx,12(%r8)
- movl %ebp,16(%r8)
- movq -40(%r11),%r14
-
- movq -32(%r11),%r13
-
- movq -24(%r11),%r12
-
- movq -16(%r11),%rbp
-
- movq -8(%r11),%rbx
-
- leaq (%r11),%rsp
-
-L$epilogue_avx:
- ret
-
-
-
-.p2align 4
-sha1_block_data_order_avx2:
-_avx2_shortcut:
-
- movq %rsp,%r11
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- vzeroupper
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- leaq -640(%rsp),%rsp
- shlq $6,%r10
- leaq 64(%r9),%r13
- andq $-128,%rsp
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- cmpq %r10,%r13
- cmovaeq %r9,%r13
- movl 4(%r8),%ebp
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl 16(%r8),%esi
- vmovdqu 64(%r14),%ymm6
-
- vmovdqu (%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- leaq 64(%r9),%r9
- vinserti128 $1,(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vpshufb %ymm6,%ymm0,%ymm0
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vpshufb %ymm6,%ymm1,%ymm1
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- vpshufb %ymm6,%ymm2,%ymm2
- vmovdqu -64(%r14),%ymm11
- vpshufb %ymm6,%ymm3,%ymm3
-
- vpaddd %ymm11,%ymm0,%ymm4
- vpaddd %ymm11,%ymm1,%ymm5
- vmovdqu %ymm4,0(%rsp)
- vpaddd %ymm11,%ymm2,%ymm6
- vmovdqu %ymm5,32(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- vmovdqu %ymm6,64(%rsp)
- vmovdqu %ymm7,96(%rsp)
- vpalignr $8,%ymm0,%ymm1,%ymm4
- vpsrldq $4,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $31,%ymm4,%ymm8
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- vpxor %ymm10,%ymm4,%ymm4
- vpaddd %ymm11,%ymm4,%ymm9
- vmovdqu %ymm9,128(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm5
- vpsrldq $4,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r14),%ymm11
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm5,%ymm5
- vpaddd %ymm11,%ymm5,%ymm9
- vmovdqu %ymm9,160(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm6
- vpsrldq $4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $31,%ymm6,%ymm8
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- vpxor %ymm10,%ymm6,%ymm6
- vpaddd %ymm11,%ymm6,%ymm9
- vmovdqu %ymm9,192(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm7
- vpsrldq $4,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm7,%ymm8
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- vpxor %ymm10,%ymm7,%ymm7
- vpaddd %ymm11,%ymm7,%ymm9
- vmovdqu %ymm9,224(%rsp)
- leaq 128(%rsp),%r13
- jmp L$oop_avx2
-.p2align 5
-L$oop_avx2:
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- jmp L$align32_1
-.p2align 5
-L$align32_1:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- vpxor %ymm1,%ymm0,%ymm0
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpxor %ymm8,%ymm0,%ymm0
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vpor %ymm8,%ymm0,%ymm0
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- vpaddd %ymm11,%ymm0,%ymm9
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- vmovdqu %ymm9,256(%rsp)
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- vpxor %ymm2,%ymm1,%ymm1
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpxor %ymm8,%ymm1,%ymm1
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vpor %ymm8,%ymm1,%ymm1
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- vpaddd %ymm11,%ymm1,%ymm9
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vmovdqu %ymm9,288(%rsp)
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- vpxor %ymm3,%ymm2,%ymm2
- vmovdqu 0(%r14),%ymm11
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpxor %ymm8,%ymm2,%ymm2
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vpor %ymm8,%ymm2,%ymm2
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- vpaddd %ymm11,%ymm2,%ymm9
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vmovdqu %ymm9,320(%rsp)
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- vpxor %ymm4,%ymm3,%ymm3
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpxor %ymm8,%ymm3,%ymm3
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- vpor %ymm8,%ymm3,%ymm3
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- vpaddd %ymm11,%ymm3,%ymm9
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vmovdqu %ymm9,352(%rsp)
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpalignr $8,%ymm2,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpxor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- vpsrld $30,%ymm4,%ymm8
- vpslld $2,%ymm4,%ymm4
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpor %ymm8,%ymm4,%ymm4
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpaddd %ymm11,%ymm4,%ymm9
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- vmovdqu %ymm9,384(%rsp)
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpalignr $8,%ymm3,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm6,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpxor %ymm8,%ymm5,%ymm5
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- vpsrld $30,%ymm5,%ymm8
- vpslld $2,%ymm5,%ymm5
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vpor %ymm8,%ymm5,%ymm5
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- vmovdqu %ymm9,416(%rsp)
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- vpxor %ymm8,%ymm6,%ymm6
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- vpsrld $30,%ymm6,%ymm8
- vpslld $2,%ymm6,%ymm6
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpor %ymm8,%ymm6,%ymm6
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- vmovdqu %ymm9,448(%rsp)
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm5,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm0,%ymm7,%ymm7
- vmovdqu 32(%r14),%ymm11
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpxor %ymm8,%ymm7,%ymm7
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- vpsrld $30,%ymm7,%ymm8
- vpslld $2,%ymm7,%ymm7
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpor %ymm8,%ymm7,%ymm7
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- vmovdqu %ymm9,480(%rsp)
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- jmp L$align32_2
-.p2align 5
-L$align32_2:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- vpxor %ymm1,%ymm0,%ymm0
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- vpxor %ymm8,%ymm0,%ymm0
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- vpor %ymm8,%ymm0,%ymm0
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpaddd %ymm11,%ymm0,%ymm9
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- vmovdqu %ymm9,512(%rsp)
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -28(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm2,%ymm1,%ymm1
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpxor %ymm8,%ymm1,%ymm1
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- vpor %ymm8,%ymm1,%ymm1
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpaddd %ymm11,%ymm1,%ymm9
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- vmovdqu %ymm9,544(%rsp)
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- vpxor %ymm3,%ymm2,%ymm2
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm8,%ymm2,%ymm2
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- vpor %ymm8,%ymm2,%ymm2
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpaddd %ymm11,%ymm2,%ymm9
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- vmovdqu %ymm9,576(%rsp)
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl 44(%r13),%edx
- xorl %ebx,%eax
- vpxor %ymm4,%ymm3,%ymm3
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm3,%ymm3
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl %r12d,%edx
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- vpor %ymm8,%ymm3,%ymm3
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpaddd %ymm11,%ymm3,%ymm9
- addl %r12d,%ecx
- andl %edi,%edx
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vmovdqu %ymm9,608(%rsp)
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -60(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%r9),%r13
- leaq 128(%r9),%rdi
- cmpq %r10,%r13
- cmovaeq %r9,%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- je L$done_avx2
- vmovdqu 64(%r14),%ymm6
- cmpq %r10,%rdi
- ja L$ast_avx2
-
- vmovdqu -64(%rdi),%xmm0
- vmovdqu -48(%rdi),%xmm1
- vmovdqu -32(%rdi),%xmm2
- vmovdqu -16(%rdi),%xmm3
- vinserti128 $1,0(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- jmp L$ast_avx2
-
-.p2align 5
-L$ast_avx2:
- leaq 128+16(%rsp),%r13
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- subq $-128,%r9
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vmovdqu -64(%r14),%ymm11
- vpshufb %ymm6,%ymm0,%ymm0
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpshufb %ymm6,%ymm1,%ymm1
- vpaddd %ymm11,%ymm0,%ymm8
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vmovdqu %ymm8,0(%rsp)
- vpshufb %ymm6,%ymm2,%ymm2
- vpaddd %ymm11,%ymm1,%ymm9
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- vmovdqu %ymm9,32(%rsp)
- vpshufb %ymm6,%ymm3,%ymm3
- vpaddd %ymm11,%ymm2,%ymm6
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- jmp L$align32_3
-.p2align 5
-L$align32_3:
- vmovdqu %ymm6,64(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- addl -28(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vmovdqu %ymm7,96(%rsp)
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm0,%ymm1,%ymm4
- addl 44(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- vpsrldq $4,%ymm3,%ymm8
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- xorl %ebp,%esi
- addl %r12d,%edx
- vpxor %ymm8,%ymm4,%ymm4
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- vpsrld $31,%ymm4,%ymm8
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- andl %edi,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm10,%ymm4,%ymm4
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpaddd %ymm11,%ymm4,%ymm9
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vmovdqu %ymm9,128(%rsp)
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm1,%ymm2,%ymm5
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrldq $4,%ymm4,%ymm8
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r14),%ymm11
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- xorl %eax,%edx
- addl %r12d,%ecx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- vpxor %ymm10,%ymm5,%ymm5
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vmovdqu %ymm9,160(%rsp)
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm2,%ymm3,%ymm6
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpsrldq $4,%ymm5,%ymm8
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm8,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- vpsrld $31,%ymm6,%ymm8
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- xorl %ebp,%esi
- addl %r12d,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- vpxor %ymm10,%ymm6,%ymm6
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vmovdqu %ymm9,192(%rsp)
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpalignr $8,%ymm3,%ymm4,%ymm7
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpsrldq $4,%ymm6,%ymm8
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm8,%ymm7,%ymm7
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- vpsrld $31,%ymm7,%ymm8
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- xorl %ebx,%eax
- addl %r12d,%esi
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- xorl %ecx,%eax
- addl -60(%r13),%edx
- vpxor %ymm10,%ymm7,%ymm7
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vmovdqu %ymm9,224(%rsp)
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%rsp),%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- jbe L$oop_avx2
-
-L$done_avx2:
- vzeroupper
- movq -40(%r11),%r14
-
- movq -32(%r11),%r13
-
- movq -24(%r11),%r12
-
- movq -16(%r11),%rbp
-
- movq -8(%r11),%rbx
-
- leaq (%r11),%rsp
-
-L$epilogue_avx2:
- ret
-
-
-.section __DATA,__const
-.p2align 6
-K_XX_XX:
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
-.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align 6
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S
deleted file mode 100644
index 018af0d..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S
+++ /dev/null
@@ -1,4178 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-.globl _sha256_block_data_order
-.private_extern _sha256_block_data_order
-
-.p2align 4
-_sha256_block_data_order:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 0(%r11),%r9d
- movl 4(%r11),%r10d
- movl 8(%r11),%r11d
- testl $536870912,%r11d
- jnz L$shaext_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je L$avx_shortcut
- testl $512,%r10d
- jnz L$ssse3_shortcut
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $64+32,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %rax,88(%rsp)
-
-L$prologue:
-
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- jmp L$loop
-
-.p2align 4
-L$loop:
- movl %ebx,%edi
- leaq K256(%rip),%rbp
- xorl %ecx,%edi
- movl 0(%rsi),%r12d
- movl %r8d,%r13d
- movl %eax,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r9d,%r15d
-
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r15d
-
- movl %r12d,0(%rsp)
- xorl %eax,%r14d
- andl %r8d,%r15d
-
- rorl $5,%r13d
- addl %r11d,%r12d
- xorl %r10d,%r15d
-
- rorl $11,%r14d
- xorl %r8d,%r13d
- addl %r15d,%r12d
-
- movl %eax,%r15d
- addl (%rbp),%r12d
- xorl %eax,%r14d
-
- xorl %ebx,%r15d
- rorl $6,%r13d
- movl %ebx,%r11d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r11d
- addl %r12d,%edx
- addl %r12d,%r11d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r11d
- movl 4(%rsi),%r12d
- movl %edx,%r13d
- movl %r11d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r8d,%edi
-
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%edi
-
- movl %r12d,4(%rsp)
- xorl %r11d,%r14d
- andl %edx,%edi
-
- rorl $5,%r13d
- addl %r10d,%r12d
- xorl %r9d,%edi
-
- rorl $11,%r14d
- xorl %edx,%r13d
- addl %edi,%r12d
-
- movl %r11d,%edi
- addl (%rbp),%r12d
- xorl %r11d,%r14d
-
- xorl %eax,%edi
- rorl $6,%r13d
- movl %eax,%r10d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r10d
- addl %r12d,%ecx
- addl %r12d,%r10d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r10d
- movl 8(%rsi),%r12d
- movl %ecx,%r13d
- movl %r10d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %edx,%r15d
-
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r15d
-
- movl %r12d,8(%rsp)
- xorl %r10d,%r14d
- andl %ecx,%r15d
-
- rorl $5,%r13d
- addl %r9d,%r12d
- xorl %r8d,%r15d
-
- rorl $11,%r14d
- xorl %ecx,%r13d
- addl %r15d,%r12d
-
- movl %r10d,%r15d
- addl (%rbp),%r12d
- xorl %r10d,%r14d
-
- xorl %r11d,%r15d
- rorl $6,%r13d
- movl %r11d,%r9d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r9d
- addl %r12d,%ebx
- addl %r12d,%r9d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r9d
- movl 12(%rsi),%r12d
- movl %ebx,%r13d
- movl %r9d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %ecx,%edi
-
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%edi
-
- movl %r12d,12(%rsp)
- xorl %r9d,%r14d
- andl %ebx,%edi
-
- rorl $5,%r13d
- addl %r8d,%r12d
- xorl %edx,%edi
-
- rorl $11,%r14d
- xorl %ebx,%r13d
- addl %edi,%r12d
-
- movl %r9d,%edi
- addl (%rbp),%r12d
- xorl %r9d,%r14d
-
- xorl %r10d,%edi
- rorl $6,%r13d
- movl %r10d,%r8d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r8d
- addl %r12d,%eax
- addl %r12d,%r8d
-
- leaq 20(%rbp),%rbp
- addl %r14d,%r8d
- movl 16(%rsi),%r12d
- movl %eax,%r13d
- movl %r8d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %ebx,%r15d
-
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r15d
-
- movl %r12d,16(%rsp)
- xorl %r8d,%r14d
- andl %eax,%r15d
-
- rorl $5,%r13d
- addl %edx,%r12d
- xorl %ecx,%r15d
-
- rorl $11,%r14d
- xorl %eax,%r13d
- addl %r15d,%r12d
-
- movl %r8d,%r15d
- addl (%rbp),%r12d
- xorl %r8d,%r14d
-
- xorl %r9d,%r15d
- rorl $6,%r13d
- movl %r9d,%edx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%edx
- addl %r12d,%r11d
- addl %r12d,%edx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%edx
- movl 20(%rsi),%r12d
- movl %r11d,%r13d
- movl %edx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %eax,%edi
-
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%edi
-
- movl %r12d,20(%rsp)
- xorl %edx,%r14d
- andl %r11d,%edi
-
- rorl $5,%r13d
- addl %ecx,%r12d
- xorl %ebx,%edi
-
- rorl $11,%r14d
- xorl %r11d,%r13d
- addl %edi,%r12d
-
- movl %edx,%edi
- addl (%rbp),%r12d
- xorl %edx,%r14d
-
- xorl %r8d,%edi
- rorl $6,%r13d
- movl %r8d,%ecx
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%ecx
- addl %r12d,%r10d
- addl %r12d,%ecx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%ecx
- movl 24(%rsi),%r12d
- movl %r10d,%r13d
- movl %ecx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r11d,%r15d
-
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r15d
-
- movl %r12d,24(%rsp)
- xorl %ecx,%r14d
- andl %r10d,%r15d
-
- rorl $5,%r13d
- addl %ebx,%r12d
- xorl %eax,%r15d
-
- rorl $11,%r14d
- xorl %r10d,%r13d
- addl %r15d,%r12d
-
- movl %ecx,%r15d
- addl (%rbp),%r12d
- xorl %ecx,%r14d
-
- xorl %edx,%r15d
- rorl $6,%r13d
- movl %edx,%ebx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%ebx
- addl %r12d,%r9d
- addl %r12d,%ebx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%ebx
- movl 28(%rsi),%r12d
- movl %r9d,%r13d
- movl %ebx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r10d,%edi
-
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%edi
-
- movl %r12d,28(%rsp)
- xorl %ebx,%r14d
- andl %r9d,%edi
-
- rorl $5,%r13d
- addl %eax,%r12d
- xorl %r11d,%edi
-
- rorl $11,%r14d
- xorl %r9d,%r13d
- addl %edi,%r12d
-
- movl %ebx,%edi
- addl (%rbp),%r12d
- xorl %ebx,%r14d
-
- xorl %ecx,%edi
- rorl $6,%r13d
- movl %ecx,%eax
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%eax
- addl %r12d,%r8d
- addl %r12d,%eax
-
- leaq 20(%rbp),%rbp
- addl %r14d,%eax
- movl 32(%rsi),%r12d
- movl %r8d,%r13d
- movl %eax,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r9d,%r15d
-
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r15d
-
- movl %r12d,32(%rsp)
- xorl %eax,%r14d
- andl %r8d,%r15d
-
- rorl $5,%r13d
- addl %r11d,%r12d
- xorl %r10d,%r15d
-
- rorl $11,%r14d
- xorl %r8d,%r13d
- addl %r15d,%r12d
-
- movl %eax,%r15d
- addl (%rbp),%r12d
- xorl %eax,%r14d
-
- xorl %ebx,%r15d
- rorl $6,%r13d
- movl %ebx,%r11d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r11d
- addl %r12d,%edx
- addl %r12d,%r11d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r11d
- movl 36(%rsi),%r12d
- movl %edx,%r13d
- movl %r11d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r8d,%edi
-
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%edi
-
- movl %r12d,36(%rsp)
- xorl %r11d,%r14d
- andl %edx,%edi
-
- rorl $5,%r13d
- addl %r10d,%r12d
- xorl %r9d,%edi
-
- rorl $11,%r14d
- xorl %edx,%r13d
- addl %edi,%r12d
-
- movl %r11d,%edi
- addl (%rbp),%r12d
- xorl %r11d,%r14d
-
- xorl %eax,%edi
- rorl $6,%r13d
- movl %eax,%r10d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r10d
- addl %r12d,%ecx
- addl %r12d,%r10d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r10d
- movl 40(%rsi),%r12d
- movl %ecx,%r13d
- movl %r10d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %edx,%r15d
-
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r15d
-
- movl %r12d,40(%rsp)
- xorl %r10d,%r14d
- andl %ecx,%r15d
-
- rorl $5,%r13d
- addl %r9d,%r12d
- xorl %r8d,%r15d
-
- rorl $11,%r14d
- xorl %ecx,%r13d
- addl %r15d,%r12d
-
- movl %r10d,%r15d
- addl (%rbp),%r12d
- xorl %r10d,%r14d
-
- xorl %r11d,%r15d
- rorl $6,%r13d
- movl %r11d,%r9d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r9d
- addl %r12d,%ebx
- addl %r12d,%r9d
-
- leaq 4(%rbp),%rbp
- addl %r14d,%r9d
- movl 44(%rsi),%r12d
- movl %ebx,%r13d
- movl %r9d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %ecx,%edi
-
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%edi
-
- movl %r12d,44(%rsp)
- xorl %r9d,%r14d
- andl %ebx,%edi
-
- rorl $5,%r13d
- addl %r8d,%r12d
- xorl %edx,%edi
-
- rorl $11,%r14d
- xorl %ebx,%r13d
- addl %edi,%r12d
-
- movl %r9d,%edi
- addl (%rbp),%r12d
- xorl %r9d,%r14d
-
- xorl %r10d,%edi
- rorl $6,%r13d
- movl %r10d,%r8d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r8d
- addl %r12d,%eax
- addl %r12d,%r8d
-
- leaq 20(%rbp),%rbp
- addl %r14d,%r8d
- movl 48(%rsi),%r12d
- movl %eax,%r13d
- movl %r8d,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %ebx,%r15d
-
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r15d
-
- movl %r12d,48(%rsp)
- xorl %r8d,%r14d
- andl %eax,%r15d
-
- rorl $5,%r13d
- addl %edx,%r12d
- xorl %ecx,%r15d
-
- rorl $11,%r14d
- xorl %eax,%r13d
- addl %r15d,%r12d
-
- movl %r8d,%r15d
- addl (%rbp),%r12d
- xorl %r8d,%r14d
-
- xorl %r9d,%r15d
- rorl $6,%r13d
- movl %r9d,%edx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%edx
- addl %r12d,%r11d
- addl %r12d,%edx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%edx
- movl 52(%rsi),%r12d
- movl %r11d,%r13d
- movl %edx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %eax,%edi
-
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%edi
-
- movl %r12d,52(%rsp)
- xorl %edx,%r14d
- andl %r11d,%edi
-
- rorl $5,%r13d
- addl %ecx,%r12d
- xorl %ebx,%edi
-
- rorl $11,%r14d
- xorl %r11d,%r13d
- addl %edi,%r12d
-
- movl %edx,%edi
- addl (%rbp),%r12d
- xorl %edx,%r14d
-
- xorl %r8d,%edi
- rorl $6,%r13d
- movl %r8d,%ecx
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%ecx
- addl %r12d,%r10d
- addl %r12d,%ecx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%ecx
- movl 56(%rsi),%r12d
- movl %r10d,%r13d
- movl %ecx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r11d,%r15d
-
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r15d
-
- movl %r12d,56(%rsp)
- xorl %ecx,%r14d
- andl %r10d,%r15d
-
- rorl $5,%r13d
- addl %ebx,%r12d
- xorl %eax,%r15d
-
- rorl $11,%r14d
- xorl %r10d,%r13d
- addl %r15d,%r12d
-
- movl %ecx,%r15d
- addl (%rbp),%r12d
- xorl %ecx,%r14d
-
- xorl %edx,%r15d
- rorl $6,%r13d
- movl %edx,%ebx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%ebx
- addl %r12d,%r9d
- addl %r12d,%ebx
-
- leaq 4(%rbp),%rbp
- addl %r14d,%ebx
- movl 60(%rsi),%r12d
- movl %r9d,%r13d
- movl %ebx,%r14d
- bswapl %r12d
- rorl $14,%r13d
- movl %r10d,%edi
-
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%edi
-
- movl %r12d,60(%rsp)
- xorl %ebx,%r14d
- andl %r9d,%edi
-
- rorl $5,%r13d
- addl %eax,%r12d
- xorl %r11d,%edi
-
- rorl $11,%r14d
- xorl %r9d,%r13d
- addl %edi,%r12d
-
- movl %ebx,%edi
- addl (%rbp),%r12d
- xorl %ebx,%r14d
-
- xorl %ecx,%edi
- rorl $6,%r13d
- movl %ecx,%eax
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%eax
- addl %r12d,%r8d
- addl %r12d,%eax
-
- leaq 20(%rbp),%rbp
- jmp L$rounds_16_xx
-.p2align 4
-L$rounds_16_xx:
- movl 4(%rsp),%r13d
- movl 56(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%eax
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 36(%rsp),%r12d
-
- addl 0(%rsp),%r12d
- movl %r8d,%r13d
- addl %r15d,%r12d
- movl %eax,%r14d
- rorl $14,%r13d
- movl %r9d,%r15d
-
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r15d
-
- movl %r12d,0(%rsp)
- xorl %eax,%r14d
- andl %r8d,%r15d
-
- rorl $5,%r13d
- addl %r11d,%r12d
- xorl %r10d,%r15d
-
- rorl $11,%r14d
- xorl %r8d,%r13d
- addl %r15d,%r12d
-
- movl %eax,%r15d
- addl (%rbp),%r12d
- xorl %eax,%r14d
-
- xorl %ebx,%r15d
- rorl $6,%r13d
- movl %ebx,%r11d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r11d
- addl %r12d,%edx
- addl %r12d,%r11d
-
- leaq 4(%rbp),%rbp
- movl 8(%rsp),%r13d
- movl 60(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r11d
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 40(%rsp),%r12d
-
- addl 4(%rsp),%r12d
- movl %edx,%r13d
- addl %edi,%r12d
- movl %r11d,%r14d
- rorl $14,%r13d
- movl %r8d,%edi
-
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%edi
-
- movl %r12d,4(%rsp)
- xorl %r11d,%r14d
- andl %edx,%edi
-
- rorl $5,%r13d
- addl %r10d,%r12d
- xorl %r9d,%edi
-
- rorl $11,%r14d
- xorl %edx,%r13d
- addl %edi,%r12d
-
- movl %r11d,%edi
- addl (%rbp),%r12d
- xorl %r11d,%r14d
-
- xorl %eax,%edi
- rorl $6,%r13d
- movl %eax,%r10d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r10d
- addl %r12d,%ecx
- addl %r12d,%r10d
-
- leaq 4(%rbp),%rbp
- movl 12(%rsp),%r13d
- movl 0(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r10d
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 44(%rsp),%r12d
-
- addl 8(%rsp),%r12d
- movl %ecx,%r13d
- addl %r15d,%r12d
- movl %r10d,%r14d
- rorl $14,%r13d
- movl %edx,%r15d
-
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r15d
-
- movl %r12d,8(%rsp)
- xorl %r10d,%r14d
- andl %ecx,%r15d
-
- rorl $5,%r13d
- addl %r9d,%r12d
- xorl %r8d,%r15d
-
- rorl $11,%r14d
- xorl %ecx,%r13d
- addl %r15d,%r12d
-
- movl %r10d,%r15d
- addl (%rbp),%r12d
- xorl %r10d,%r14d
-
- xorl %r11d,%r15d
- rorl $6,%r13d
- movl %r11d,%r9d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r9d
- addl %r12d,%ebx
- addl %r12d,%r9d
-
- leaq 4(%rbp),%rbp
- movl 16(%rsp),%r13d
- movl 4(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r9d
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 48(%rsp),%r12d
-
- addl 12(%rsp),%r12d
- movl %ebx,%r13d
- addl %edi,%r12d
- movl %r9d,%r14d
- rorl $14,%r13d
- movl %ecx,%edi
-
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%edi
-
- movl %r12d,12(%rsp)
- xorl %r9d,%r14d
- andl %ebx,%edi
-
- rorl $5,%r13d
- addl %r8d,%r12d
- xorl %edx,%edi
-
- rorl $11,%r14d
- xorl %ebx,%r13d
- addl %edi,%r12d
-
- movl %r9d,%edi
- addl (%rbp),%r12d
- xorl %r9d,%r14d
-
- xorl %r10d,%edi
- rorl $6,%r13d
- movl %r10d,%r8d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r8d
- addl %r12d,%eax
- addl %r12d,%r8d
-
- leaq 20(%rbp),%rbp
- movl 20(%rsp),%r13d
- movl 8(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r8d
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 52(%rsp),%r12d
-
- addl 16(%rsp),%r12d
- movl %eax,%r13d
- addl %r15d,%r12d
- movl %r8d,%r14d
- rorl $14,%r13d
- movl %ebx,%r15d
-
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r15d
-
- movl %r12d,16(%rsp)
- xorl %r8d,%r14d
- andl %eax,%r15d
-
- rorl $5,%r13d
- addl %edx,%r12d
- xorl %ecx,%r15d
-
- rorl $11,%r14d
- xorl %eax,%r13d
- addl %r15d,%r12d
-
- movl %r8d,%r15d
- addl (%rbp),%r12d
- xorl %r8d,%r14d
-
- xorl %r9d,%r15d
- rorl $6,%r13d
- movl %r9d,%edx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%edx
- addl %r12d,%r11d
- addl %r12d,%edx
-
- leaq 4(%rbp),%rbp
- movl 24(%rsp),%r13d
- movl 12(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%edx
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 56(%rsp),%r12d
-
- addl 20(%rsp),%r12d
- movl %r11d,%r13d
- addl %edi,%r12d
- movl %edx,%r14d
- rorl $14,%r13d
- movl %eax,%edi
-
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%edi
-
- movl %r12d,20(%rsp)
- xorl %edx,%r14d
- andl %r11d,%edi
-
- rorl $5,%r13d
- addl %ecx,%r12d
- xorl %ebx,%edi
-
- rorl $11,%r14d
- xorl %r11d,%r13d
- addl %edi,%r12d
-
- movl %edx,%edi
- addl (%rbp),%r12d
- xorl %edx,%r14d
-
- xorl %r8d,%edi
- rorl $6,%r13d
- movl %r8d,%ecx
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%ecx
- addl %r12d,%r10d
- addl %r12d,%ecx
-
- leaq 4(%rbp),%rbp
- movl 28(%rsp),%r13d
- movl 16(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%ecx
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 60(%rsp),%r12d
-
- addl 24(%rsp),%r12d
- movl %r10d,%r13d
- addl %r15d,%r12d
- movl %ecx,%r14d
- rorl $14,%r13d
- movl %r11d,%r15d
-
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r15d
-
- movl %r12d,24(%rsp)
- xorl %ecx,%r14d
- andl %r10d,%r15d
-
- rorl $5,%r13d
- addl %ebx,%r12d
- xorl %eax,%r15d
-
- rorl $11,%r14d
- xorl %r10d,%r13d
- addl %r15d,%r12d
-
- movl %ecx,%r15d
- addl (%rbp),%r12d
- xorl %ecx,%r14d
-
- xorl %edx,%r15d
- rorl $6,%r13d
- movl %edx,%ebx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%ebx
- addl %r12d,%r9d
- addl %r12d,%ebx
-
- leaq 4(%rbp),%rbp
- movl 32(%rsp),%r13d
- movl 20(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%ebx
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 0(%rsp),%r12d
-
- addl 28(%rsp),%r12d
- movl %r9d,%r13d
- addl %edi,%r12d
- movl %ebx,%r14d
- rorl $14,%r13d
- movl %r10d,%edi
-
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%edi
-
- movl %r12d,28(%rsp)
- xorl %ebx,%r14d
- andl %r9d,%edi
-
- rorl $5,%r13d
- addl %eax,%r12d
- xorl %r11d,%edi
-
- rorl $11,%r14d
- xorl %r9d,%r13d
- addl %edi,%r12d
-
- movl %ebx,%edi
- addl (%rbp),%r12d
- xorl %ebx,%r14d
-
- xorl %ecx,%edi
- rorl $6,%r13d
- movl %ecx,%eax
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%eax
- addl %r12d,%r8d
- addl %r12d,%eax
-
- leaq 20(%rbp),%rbp
- movl 36(%rsp),%r13d
- movl 24(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%eax
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 4(%rsp),%r12d
-
- addl 32(%rsp),%r12d
- movl %r8d,%r13d
- addl %r15d,%r12d
- movl %eax,%r14d
- rorl $14,%r13d
- movl %r9d,%r15d
-
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r15d
-
- movl %r12d,32(%rsp)
- xorl %eax,%r14d
- andl %r8d,%r15d
-
- rorl $5,%r13d
- addl %r11d,%r12d
- xorl %r10d,%r15d
-
- rorl $11,%r14d
- xorl %r8d,%r13d
- addl %r15d,%r12d
-
- movl %eax,%r15d
- addl (%rbp),%r12d
- xorl %eax,%r14d
-
- xorl %ebx,%r15d
- rorl $6,%r13d
- movl %ebx,%r11d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r11d
- addl %r12d,%edx
- addl %r12d,%r11d
-
- leaq 4(%rbp),%rbp
- movl 40(%rsp),%r13d
- movl 28(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r11d
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 8(%rsp),%r12d
-
- addl 36(%rsp),%r12d
- movl %edx,%r13d
- addl %edi,%r12d
- movl %r11d,%r14d
- rorl $14,%r13d
- movl %r8d,%edi
-
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%edi
-
- movl %r12d,36(%rsp)
- xorl %r11d,%r14d
- andl %edx,%edi
-
- rorl $5,%r13d
- addl %r10d,%r12d
- xorl %r9d,%edi
-
- rorl $11,%r14d
- xorl %edx,%r13d
- addl %edi,%r12d
-
- movl %r11d,%edi
- addl (%rbp),%r12d
- xorl %r11d,%r14d
-
- xorl %eax,%edi
- rorl $6,%r13d
- movl %eax,%r10d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r10d
- addl %r12d,%ecx
- addl %r12d,%r10d
-
- leaq 4(%rbp),%rbp
- movl 44(%rsp),%r13d
- movl 32(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r10d
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 12(%rsp),%r12d
-
- addl 40(%rsp),%r12d
- movl %ecx,%r13d
- addl %r15d,%r12d
- movl %r10d,%r14d
- rorl $14,%r13d
- movl %edx,%r15d
-
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r15d
-
- movl %r12d,40(%rsp)
- xorl %r10d,%r14d
- andl %ecx,%r15d
-
- rorl $5,%r13d
- addl %r9d,%r12d
- xorl %r8d,%r15d
-
- rorl $11,%r14d
- xorl %ecx,%r13d
- addl %r15d,%r12d
-
- movl %r10d,%r15d
- addl (%rbp),%r12d
- xorl %r10d,%r14d
-
- xorl %r11d,%r15d
- rorl $6,%r13d
- movl %r11d,%r9d
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%r9d
- addl %r12d,%ebx
- addl %r12d,%r9d
-
- leaq 4(%rbp),%rbp
- movl 48(%rsp),%r13d
- movl 36(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r9d
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 16(%rsp),%r12d
-
- addl 44(%rsp),%r12d
- movl %ebx,%r13d
- addl %edi,%r12d
- movl %r9d,%r14d
- rorl $14,%r13d
- movl %ecx,%edi
-
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%edi
-
- movl %r12d,44(%rsp)
- xorl %r9d,%r14d
- andl %ebx,%edi
-
- rorl $5,%r13d
- addl %r8d,%r12d
- xorl %edx,%edi
-
- rorl $11,%r14d
- xorl %ebx,%r13d
- addl %edi,%r12d
-
- movl %r9d,%edi
- addl (%rbp),%r12d
- xorl %r9d,%r14d
-
- xorl %r10d,%edi
- rorl $6,%r13d
- movl %r10d,%r8d
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%r8d
- addl %r12d,%eax
- addl %r12d,%r8d
-
- leaq 20(%rbp),%rbp
- movl 52(%rsp),%r13d
- movl 40(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%r8d
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 20(%rsp),%r12d
-
- addl 48(%rsp),%r12d
- movl %eax,%r13d
- addl %r15d,%r12d
- movl %r8d,%r14d
- rorl $14,%r13d
- movl %ebx,%r15d
-
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r15d
-
- movl %r12d,48(%rsp)
- xorl %r8d,%r14d
- andl %eax,%r15d
-
- rorl $5,%r13d
- addl %edx,%r12d
- xorl %ecx,%r15d
-
- rorl $11,%r14d
- xorl %eax,%r13d
- addl %r15d,%r12d
-
- movl %r8d,%r15d
- addl (%rbp),%r12d
- xorl %r8d,%r14d
-
- xorl %r9d,%r15d
- rorl $6,%r13d
- movl %r9d,%edx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%edx
- addl %r12d,%r11d
- addl %r12d,%edx
-
- leaq 4(%rbp),%rbp
- movl 56(%rsp),%r13d
- movl 44(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%edx
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 24(%rsp),%r12d
-
- addl 52(%rsp),%r12d
- movl %r11d,%r13d
- addl %edi,%r12d
- movl %edx,%r14d
- rorl $14,%r13d
- movl %eax,%edi
-
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%edi
-
- movl %r12d,52(%rsp)
- xorl %edx,%r14d
- andl %r11d,%edi
-
- rorl $5,%r13d
- addl %ecx,%r12d
- xorl %ebx,%edi
-
- rorl $11,%r14d
- xorl %r11d,%r13d
- addl %edi,%r12d
-
- movl %edx,%edi
- addl (%rbp),%r12d
- xorl %edx,%r14d
-
- xorl %r8d,%edi
- rorl $6,%r13d
- movl %r8d,%ecx
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%ecx
- addl %r12d,%r10d
- addl %r12d,%ecx
-
- leaq 4(%rbp),%rbp
- movl 60(%rsp),%r13d
- movl 48(%rsp),%r15d
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%ecx
- movl %r15d,%r14d
- rorl $2,%r15d
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%r15d
- shrl $10,%r14d
-
- rorl $17,%r15d
- xorl %r13d,%r12d
- xorl %r14d,%r15d
- addl 28(%rsp),%r12d
-
- addl 56(%rsp),%r12d
- movl %r10d,%r13d
- addl %r15d,%r12d
- movl %ecx,%r14d
- rorl $14,%r13d
- movl %r11d,%r15d
-
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r15d
-
- movl %r12d,56(%rsp)
- xorl %ecx,%r14d
- andl %r10d,%r15d
-
- rorl $5,%r13d
- addl %ebx,%r12d
- xorl %eax,%r15d
-
- rorl $11,%r14d
- xorl %r10d,%r13d
- addl %r15d,%r12d
-
- movl %ecx,%r15d
- addl (%rbp),%r12d
- xorl %ecx,%r14d
-
- xorl %edx,%r15d
- rorl $6,%r13d
- movl %edx,%ebx
-
- andl %r15d,%edi
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %edi,%ebx
- addl %r12d,%r9d
- addl %r12d,%ebx
-
- leaq 4(%rbp),%rbp
- movl 0(%rsp),%r13d
- movl 52(%rsp),%edi
-
- movl %r13d,%r12d
- rorl $11,%r13d
- addl %r14d,%ebx
- movl %edi,%r14d
- rorl $2,%edi
-
- xorl %r12d,%r13d
- shrl $3,%r12d
- rorl $7,%r13d
- xorl %r14d,%edi
- shrl $10,%r14d
-
- rorl $17,%edi
- xorl %r13d,%r12d
- xorl %r14d,%edi
- addl 32(%rsp),%r12d
-
- addl 60(%rsp),%r12d
- movl %r9d,%r13d
- addl %edi,%r12d
- movl %ebx,%r14d
- rorl $14,%r13d
- movl %r10d,%edi
-
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%edi
-
- movl %r12d,60(%rsp)
- xorl %ebx,%r14d
- andl %r9d,%edi
-
- rorl $5,%r13d
- addl %eax,%r12d
- xorl %r11d,%edi
-
- rorl $11,%r14d
- xorl %r9d,%r13d
- addl %edi,%r12d
-
- movl %ebx,%edi
- addl (%rbp),%r12d
- xorl %ebx,%r14d
-
- xorl %ecx,%edi
- rorl $6,%r13d
- movl %ecx,%eax
-
- andl %edi,%r15d
- rorl $2,%r14d
- addl %r13d,%r12d
-
- xorl %r15d,%eax
- addl %r12d,%r8d
- addl %r12d,%eax
-
- leaq 20(%rbp),%rbp
- cmpb $0,3(%rbp)
- jnz L$rounds_16_xx
-
- movq 64+0(%rsp),%rdi
- addl %r14d,%eax
- leaq 64(%rsi),%rsi
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb L$loop
-
- movq 88(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue:
- ret
-
-
-.section __DATA,__const
-.p2align 6
-
-K256:
-.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text
-
-.p2align 6
-sha256_block_data_order_shaext:
-
-L$shaext_shortcut:
- leaq K256+128(%rip),%rcx
- movdqu (%rdi),%xmm1
- movdqu 16(%rdi),%xmm2
- movdqa 512-128(%rcx),%xmm7
-
- pshufd $0x1b,%xmm1,%xmm0
- pshufd $0xb1,%xmm1,%xmm1
- pshufd $0x1b,%xmm2,%xmm2
- movdqa %xmm7,%xmm8
-.byte 102,15,58,15,202,8
- punpcklqdq %xmm0,%xmm2
- jmp L$oop_shaext
-
-.p2align 4
-L$oop_shaext:
- movdqu (%rsi),%xmm3
- movdqu 16(%rsi),%xmm4
- movdqu 32(%rsi),%xmm5
-.byte 102,15,56,0,223
- movdqu 48(%rsi),%xmm6
-
- movdqa 0-128(%rcx),%xmm0
- paddd %xmm3,%xmm0
-.byte 102,15,56,0,231
- movdqa %xmm2,%xmm10
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- nop
- movdqa %xmm1,%xmm9
-.byte 15,56,203,202
-
- movdqa 32-128(%rcx),%xmm0
- paddd %xmm4,%xmm0
-.byte 102,15,56,0,239
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- leaq 64(%rsi),%rsi
-.byte 15,56,204,220
-.byte 15,56,203,202
-
- movdqa 64-128(%rcx),%xmm0
- paddd %xmm5,%xmm0
-.byte 102,15,56,0,247
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
- nop
- paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
-
- movdqa 96-128(%rcx),%xmm0
- paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
- nop
- paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
- movdqa 128-128(%rcx),%xmm0
- paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
- nop
- paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
- movdqa 160-128(%rcx),%xmm0
- paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
- nop
- paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
- movdqa 192-128(%rcx),%xmm0
- paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
- nop
- paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
- movdqa 224-128(%rcx),%xmm0
- paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
- nop
- paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
- movdqa 256-128(%rcx),%xmm0
- paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
- nop
- paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
- movdqa 288-128(%rcx),%xmm0
- paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
- nop
- paddd %xmm7,%xmm6
-.byte 15,56,204,220
-.byte 15,56,203,202
- movdqa 320-128(%rcx),%xmm0
- paddd %xmm5,%xmm0
-.byte 15,56,205,245
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm6,%xmm7
-.byte 102,15,58,15,253,4
- nop
- paddd %xmm7,%xmm3
-.byte 15,56,204,229
-.byte 15,56,203,202
- movdqa 352-128(%rcx),%xmm0
- paddd %xmm6,%xmm0
-.byte 15,56,205,222
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,254,4
- nop
- paddd %xmm7,%xmm4
-.byte 15,56,204,238
-.byte 15,56,203,202
- movdqa 384-128(%rcx),%xmm0
- paddd %xmm3,%xmm0
-.byte 15,56,205,227
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm4,%xmm7
-.byte 102,15,58,15,251,4
- nop
- paddd %xmm7,%xmm5
-.byte 15,56,204,243
-.byte 15,56,203,202
- movdqa 416-128(%rcx),%xmm0
- paddd %xmm4,%xmm0
-.byte 15,56,205,236
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm5,%xmm7
-.byte 102,15,58,15,252,4
-.byte 15,56,203,202
- paddd %xmm7,%xmm6
-
- movdqa 448-128(%rcx),%xmm0
- paddd %xmm5,%xmm0
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
-.byte 15,56,205,245
- movdqa %xmm8,%xmm7
-.byte 15,56,203,202
-
- movdqa 480-128(%rcx),%xmm0
- paddd %xmm6,%xmm0
- nop
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- decq %rdx
- nop
-.byte 15,56,203,202
-
- paddd %xmm10,%xmm2
- paddd %xmm9,%xmm1
- jnz L$oop_shaext
-
- pshufd $0xb1,%xmm2,%xmm2
- pshufd $0x1b,%xmm1,%xmm7
- pshufd $0xb1,%xmm1,%xmm1
- punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,215,8
-
- movdqu %xmm1,(%rdi)
- movdqu %xmm2,16(%rdi)
- ret
-
-
-
-.p2align 6
-sha256_block_data_order_ssse3:
-
-L$ssse3_shortcut:
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $96,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %rax,88(%rsp)
-
-L$prologue_ssse3:
-
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
-
-
- jmp L$loop_ssse3
-.p2align 4
-L$loop_ssse3:
- movdqa K256+512(%rip),%xmm7
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
-.byte 102,15,56,0,199
- movdqu 48(%rsi),%xmm3
- leaq K256(%rip),%rbp
-.byte 102,15,56,0,207
- movdqa 0(%rbp),%xmm4
- movdqa 32(%rbp),%xmm5
-.byte 102,15,56,0,215
- paddd %xmm0,%xmm4
- movdqa 64(%rbp),%xmm6
-.byte 102,15,56,0,223
- movdqa 96(%rbp),%xmm7
- paddd %xmm1,%xmm5
- paddd %xmm2,%xmm6
- paddd %xmm3,%xmm7
- movdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- movdqa %xmm5,16(%rsp)
- movl %ebx,%edi
- movdqa %xmm6,32(%rsp)
- xorl %ecx,%edi
- movdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp L$ssse3_00_47
-
-.p2align 4
-L$ssse3_00_47:
- subq $-128,%rbp
- rorl $14,%r13d
- movdqa %xmm1,%xmm4
- movl %r14d,%eax
- movl %r9d,%r12d
- movdqa %xmm3,%xmm7
- rorl $9,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
-.byte 102,15,58,15,224,4
- andl %r8d,%r12d
- xorl %r8d,%r13d
-.byte 102,15,58,15,250,4
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- rorl $11,%r14d
- movdqa %xmm4,%xmm5
- xorl %ebx,%r15d
- addl %r12d,%r11d
- movdqa %xmm4,%xmm6
- rorl $6,%r13d
- andl %r15d,%edi
- psrld $3,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- paddd %xmm7,%xmm0
- rorl $2,%r14d
- addl %r11d,%edx
- psrld $7,%xmm6
- addl %edi,%r11d
- movl %edx,%r13d
- pshufd $250,%xmm3,%xmm7
- addl %r11d,%r14d
- rorl $14,%r13d
- pslld $14,%xmm5
- movl %r14d,%r11d
- movl %r8d,%r12d
- pxor %xmm6,%xmm4
- rorl $9,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- rorl $5,%r13d
- psrld $11,%xmm6
- xorl %r11d,%r14d
- pxor %xmm5,%xmm4
- andl %edx,%r12d
- xorl %edx,%r13d
- pslld $11,%xmm5
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- pxor %xmm6,%xmm4
- xorl %r9d,%r12d
- rorl $11,%r14d
- movdqa %xmm7,%xmm6
- xorl %eax,%edi
- addl %r12d,%r10d
- pxor %xmm5,%xmm4
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- psrld $10,%xmm7
- addl %r13d,%r10d
- xorl %eax,%r15d
- paddd %xmm4,%xmm0
- rorl $2,%r14d
- addl %r10d,%ecx
- psrlq $17,%xmm6
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- pxor %xmm6,%xmm7
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- rorl $9,%r14d
- psrlq $2,%xmm6
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- pxor %xmm6,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- pshufd $128,%xmm7,%xmm7
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- psrldq $8,%xmm7
- xorl %r8d,%r12d
- rorl $11,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- rorl $6,%r13d
- paddd %xmm7,%xmm0
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- pshufd $80,%xmm0,%xmm7
- xorl %r11d,%edi
- rorl $2,%r14d
- addl %r9d,%ebx
- movdqa %xmm7,%xmm6
- addl %edi,%r9d
- movl %ebx,%r13d
- psrld $10,%xmm7
- addl %r9d,%r14d
- rorl $14,%r13d
- psrlq $17,%xmm6
- movl %r14d,%r9d
- movl %ecx,%r12d
- pxor %xmm6,%xmm7
- rorl $9,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- psrlq $2,%xmm6
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- pxor %xmm6,%xmm7
- movl %r9d,%edi
- xorl %edx,%r12d
- rorl $11,%r14d
- pshufd $8,%xmm7,%xmm7
- xorl %r10d,%edi
- addl %r12d,%r8d
- movdqa 0(%rbp),%xmm6
- rorl $6,%r13d
- andl %edi,%r15d
- pslldq $8,%xmm7
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- paddd %xmm7,%xmm0
- rorl $2,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- paddd %xmm0,%xmm6
- movl %eax,%r13d
- addl %r8d,%r14d
- movdqa %xmm6,0(%rsp)
- rorl $14,%r13d
- movdqa %xmm2,%xmm4
- movl %r14d,%r8d
- movl %ebx,%r12d
- movdqa %xmm0,%xmm7
- rorl $9,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
-.byte 102,15,58,15,225,4
- andl %eax,%r12d
- xorl %eax,%r13d
-.byte 102,15,58,15,251,4
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- rorl $11,%r14d
- movdqa %xmm4,%xmm5
- xorl %r9d,%r15d
- addl %r12d,%edx
- movdqa %xmm4,%xmm6
- rorl $6,%r13d
- andl %r15d,%edi
- psrld $3,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- paddd %xmm7,%xmm1
- rorl $2,%r14d
- addl %edx,%r11d
- psrld $7,%xmm6
- addl %edi,%edx
- movl %r11d,%r13d
- pshufd $250,%xmm0,%xmm7
- addl %edx,%r14d
- rorl $14,%r13d
- pslld $14,%xmm5
- movl %r14d,%edx
- movl %eax,%r12d
- pxor %xmm6,%xmm4
- rorl $9,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- rorl $5,%r13d
- psrld $11,%xmm6
- xorl %edx,%r14d
- pxor %xmm5,%xmm4
- andl %r11d,%r12d
- xorl %r11d,%r13d
- pslld $11,%xmm5
- addl 20(%rsp),%ecx
- movl %edx,%edi
- pxor %xmm6,%xmm4
- xorl %ebx,%r12d
- rorl $11,%r14d
- movdqa %xmm7,%xmm6
- xorl %r8d,%edi
- addl %r12d,%ecx
- pxor %xmm5,%xmm4
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- psrld $10,%xmm7
- addl %r13d,%ecx
- xorl %r8d,%r15d
- paddd %xmm4,%xmm1
- rorl $2,%r14d
- addl %ecx,%r10d
- psrlq $17,%xmm6
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- pxor %xmm6,%xmm7
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- rorl $9,%r14d
- psrlq $2,%xmm6
- xorl %r10d,%r13d
- xorl %eax,%r12d
- pxor %xmm6,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- pshufd $128,%xmm7,%xmm7
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- psrldq $8,%xmm7
- xorl %eax,%r12d
- rorl $11,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- rorl $6,%r13d
- paddd %xmm7,%xmm1
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- pshufd $80,%xmm1,%xmm7
- xorl %edx,%edi
- rorl $2,%r14d
- addl %ebx,%r9d
- movdqa %xmm7,%xmm6
- addl %edi,%ebx
- movl %r9d,%r13d
- psrld $10,%xmm7
- addl %ebx,%r14d
- rorl $14,%r13d
- psrlq $17,%xmm6
- movl %r14d,%ebx
- movl %r10d,%r12d
- pxor %xmm6,%xmm7
- rorl $9,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- psrlq $2,%xmm6
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- pxor %xmm6,%xmm7
- movl %ebx,%edi
- xorl %r11d,%r12d
- rorl $11,%r14d
- pshufd $8,%xmm7,%xmm7
- xorl %ecx,%edi
- addl %r12d,%eax
- movdqa 32(%rbp),%xmm6
- rorl $6,%r13d
- andl %edi,%r15d
- pslldq $8,%xmm7
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- paddd %xmm7,%xmm1
- rorl $2,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- paddd %xmm1,%xmm6
- movl %r8d,%r13d
- addl %eax,%r14d
- movdqa %xmm6,16(%rsp)
- rorl $14,%r13d
- movdqa %xmm3,%xmm4
- movl %r14d,%eax
- movl %r9d,%r12d
- movdqa %xmm1,%xmm7
- rorl $9,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
-.byte 102,15,58,15,226,4
- andl %r8d,%r12d
- xorl %r8d,%r13d
-.byte 102,15,58,15,248,4
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- rorl $11,%r14d
- movdqa %xmm4,%xmm5
- xorl %ebx,%r15d
- addl %r12d,%r11d
- movdqa %xmm4,%xmm6
- rorl $6,%r13d
- andl %r15d,%edi
- psrld $3,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- paddd %xmm7,%xmm2
- rorl $2,%r14d
- addl %r11d,%edx
- psrld $7,%xmm6
- addl %edi,%r11d
- movl %edx,%r13d
- pshufd $250,%xmm1,%xmm7
- addl %r11d,%r14d
- rorl $14,%r13d
- pslld $14,%xmm5
- movl %r14d,%r11d
- movl %r8d,%r12d
- pxor %xmm6,%xmm4
- rorl $9,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- rorl $5,%r13d
- psrld $11,%xmm6
- xorl %r11d,%r14d
- pxor %xmm5,%xmm4
- andl %edx,%r12d
- xorl %edx,%r13d
- pslld $11,%xmm5
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- pxor %xmm6,%xmm4
- xorl %r9d,%r12d
- rorl $11,%r14d
- movdqa %xmm7,%xmm6
- xorl %eax,%edi
- addl %r12d,%r10d
- pxor %xmm5,%xmm4
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- psrld $10,%xmm7
- addl %r13d,%r10d
- xorl %eax,%r15d
- paddd %xmm4,%xmm2
- rorl $2,%r14d
- addl %r10d,%ecx
- psrlq $17,%xmm6
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- pxor %xmm6,%xmm7
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- rorl $9,%r14d
- psrlq $2,%xmm6
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- pxor %xmm6,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- pshufd $128,%xmm7,%xmm7
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- psrldq $8,%xmm7
- xorl %r8d,%r12d
- rorl $11,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- rorl $6,%r13d
- paddd %xmm7,%xmm2
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- pshufd $80,%xmm2,%xmm7
- xorl %r11d,%edi
- rorl $2,%r14d
- addl %r9d,%ebx
- movdqa %xmm7,%xmm6
- addl %edi,%r9d
- movl %ebx,%r13d
- psrld $10,%xmm7
- addl %r9d,%r14d
- rorl $14,%r13d
- psrlq $17,%xmm6
- movl %r14d,%r9d
- movl %ecx,%r12d
- pxor %xmm6,%xmm7
- rorl $9,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- psrlq $2,%xmm6
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- pxor %xmm6,%xmm7
- movl %r9d,%edi
- xorl %edx,%r12d
- rorl $11,%r14d
- pshufd $8,%xmm7,%xmm7
- xorl %r10d,%edi
- addl %r12d,%r8d
- movdqa 64(%rbp),%xmm6
- rorl $6,%r13d
- andl %edi,%r15d
- pslldq $8,%xmm7
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- paddd %xmm7,%xmm2
- rorl $2,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- paddd %xmm2,%xmm6
- movl %eax,%r13d
- addl %r8d,%r14d
- movdqa %xmm6,32(%rsp)
- rorl $14,%r13d
- movdqa %xmm0,%xmm4
- movl %r14d,%r8d
- movl %ebx,%r12d
- movdqa %xmm2,%xmm7
- rorl $9,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
-.byte 102,15,58,15,227,4
- andl %eax,%r12d
- xorl %eax,%r13d
-.byte 102,15,58,15,249,4
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- rorl $11,%r14d
- movdqa %xmm4,%xmm5
- xorl %r9d,%r15d
- addl %r12d,%edx
- movdqa %xmm4,%xmm6
- rorl $6,%r13d
- andl %r15d,%edi
- psrld $3,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- paddd %xmm7,%xmm3
- rorl $2,%r14d
- addl %edx,%r11d
- psrld $7,%xmm6
- addl %edi,%edx
- movl %r11d,%r13d
- pshufd $250,%xmm2,%xmm7
- addl %edx,%r14d
- rorl $14,%r13d
- pslld $14,%xmm5
- movl %r14d,%edx
- movl %eax,%r12d
- pxor %xmm6,%xmm4
- rorl $9,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- rorl $5,%r13d
- psrld $11,%xmm6
- xorl %edx,%r14d
- pxor %xmm5,%xmm4
- andl %r11d,%r12d
- xorl %r11d,%r13d
- pslld $11,%xmm5
- addl 52(%rsp),%ecx
- movl %edx,%edi
- pxor %xmm6,%xmm4
- xorl %ebx,%r12d
- rorl $11,%r14d
- movdqa %xmm7,%xmm6
- xorl %r8d,%edi
- addl %r12d,%ecx
- pxor %xmm5,%xmm4
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- psrld $10,%xmm7
- addl %r13d,%ecx
- xorl %r8d,%r15d
- paddd %xmm4,%xmm3
- rorl $2,%r14d
- addl %ecx,%r10d
- psrlq $17,%xmm6
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- pxor %xmm6,%xmm7
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- rorl $9,%r14d
- psrlq $2,%xmm6
- xorl %r10d,%r13d
- xorl %eax,%r12d
- pxor %xmm6,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- pshufd $128,%xmm7,%xmm7
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- psrldq $8,%xmm7
- xorl %eax,%r12d
- rorl $11,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- rorl $6,%r13d
- paddd %xmm7,%xmm3
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- pshufd $80,%xmm3,%xmm7
- xorl %edx,%edi
- rorl $2,%r14d
- addl %ebx,%r9d
- movdqa %xmm7,%xmm6
- addl %edi,%ebx
- movl %r9d,%r13d
- psrld $10,%xmm7
- addl %ebx,%r14d
- rorl $14,%r13d
- psrlq $17,%xmm6
- movl %r14d,%ebx
- movl %r10d,%r12d
- pxor %xmm6,%xmm7
- rorl $9,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- psrlq $2,%xmm6
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- pxor %xmm6,%xmm7
- movl %ebx,%edi
- xorl %r11d,%r12d
- rorl $11,%r14d
- pshufd $8,%xmm7,%xmm7
- xorl %ecx,%edi
- addl %r12d,%eax
- movdqa 96(%rbp),%xmm6
- rorl $6,%r13d
- andl %edi,%r15d
- pslldq $8,%xmm7
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- paddd %xmm7,%xmm3
- rorl $2,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- paddd %xmm3,%xmm6
- movl %r8d,%r13d
- addl %eax,%r14d
- movdqa %xmm6,48(%rsp)
- cmpb $0,131(%rbp)
- jne L$ssse3_00_47
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- rorl $9,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- rorl $11,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- rorl $2,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- rorl $9,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- rorl $11,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- rorl $2,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- rorl $9,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- rorl $11,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- rorl $2,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- rorl $9,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- rorl $11,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- rorl $2,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- rorl $9,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- rorl $11,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- rorl $2,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- rorl $9,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- rorl $11,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- rorl $2,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- rorl $9,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- rorl $11,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- rorl $2,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- rorl $9,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- rorl $11,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- rorl $2,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- rorl $9,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- rorl $11,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- rorl $2,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- rorl $9,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- rorl $11,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- rorl $2,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- rorl $9,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- rorl $11,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- rorl $2,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- rorl $9,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- rorl $11,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- rorl $2,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- rorl $9,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- rorl $11,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- rorl $2,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- rorl $9,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- rorl $11,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- rorl $2,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- rorl $9,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- rorl $11,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- rorl $6,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- rorl $2,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- rorl $9,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- rorl $11,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- rorl $6,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- rorl $2,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%rdi
- movl %r14d,%eax
-
- addl 0(%rdi),%eax
- leaq 64(%rsi),%rsi
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb L$loop_ssse3
-
- movq 88(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue_ssse3:
- ret
-
-
-
-.p2align 6
-sha256_block_data_order_avx:
-
-L$avx_shortcut:
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $96,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %rax,88(%rsp)
-
-L$prologue_avx:
-
- vzeroupper
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%xmm8
- vmovdqa K256+512+64(%rip),%xmm9
- jmp L$loop_avx
-.p2align 4
-L$loop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%edi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%edi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp L$avx_00_47
-
-.p2align 4
-L$avx_00_47:
- subq $-128,%rbp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm3,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm0,%xmm0
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpshufd $80,%xmm0,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm0,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm1,%xmm1
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpshufd $80,%xmm1,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm1,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm2,%xmm2
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpshufd $80,%xmm2,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm2,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm3,%xmm3
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpshufd $80,%xmm3,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- cmpb $0,131(%rbp)
- jne L$avx_00_47
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%rdi
- movl %r14d,%eax
-
- addl 0(%rdi),%eax
- leaq 64(%rsi),%rsi
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb L$loop_avx
-
- movq 88(%rsp),%rsi
-
- vzeroupper
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue_avx:
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S
deleted file mode 100644
index 6e2e13e..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S
+++ /dev/null
@@ -1,2986 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-.globl _sha512_block_data_order
-.private_extern _sha512_block_data_order
-
-.p2align 4
-_sha512_block_data_order:
-
-_CET_ENDBR
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 0(%r11),%r9d
- movl 4(%r11),%r10d
- movl 8(%r11),%r11d
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je L$avx_shortcut
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $128+32,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-
-L$prologue:
-
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$loop
-
-.p2align 4
-L$loop:
- movq %rbx,%rdi
- leaq K512(%rip),%rbp
- xorq %rcx,%rdi
- movq 0(%rsi),%r12
- movq %r8,%r13
- movq %rax,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r9,%r15
-
- xorq %r8,%r13
- rorq $5,%r14
- xorq %r10,%r15
-
- movq %r12,0(%rsp)
- xorq %rax,%r14
- andq %r8,%r15
-
- rorq $4,%r13
- addq %r11,%r12
- xorq %r10,%r15
-
- rorq $6,%r14
- xorq %r8,%r13
- addq %r15,%r12
-
- movq %rax,%r15
- addq (%rbp),%r12
- xorq %rax,%r14
-
- xorq %rbx,%r15
- rorq $14,%r13
- movq %rbx,%r11
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r11
- addq %r12,%rdx
- addq %r12,%r11
-
- leaq 8(%rbp),%rbp
- addq %r14,%r11
- movq 8(%rsi),%r12
- movq %rdx,%r13
- movq %r11,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r8,%rdi
-
- xorq %rdx,%r13
- rorq $5,%r14
- xorq %r9,%rdi
-
- movq %r12,8(%rsp)
- xorq %r11,%r14
- andq %rdx,%rdi
-
- rorq $4,%r13
- addq %r10,%r12
- xorq %r9,%rdi
-
- rorq $6,%r14
- xorq %rdx,%r13
- addq %rdi,%r12
-
- movq %r11,%rdi
- addq (%rbp),%r12
- xorq %r11,%r14
-
- xorq %rax,%rdi
- rorq $14,%r13
- movq %rax,%r10
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r10
- addq %r12,%rcx
- addq %r12,%r10
-
- leaq 24(%rbp),%rbp
- addq %r14,%r10
- movq 16(%rsi),%r12
- movq %rcx,%r13
- movq %r10,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rdx,%r15
-
- xorq %rcx,%r13
- rorq $5,%r14
- xorq %r8,%r15
-
- movq %r12,16(%rsp)
- xorq %r10,%r14
- andq %rcx,%r15
-
- rorq $4,%r13
- addq %r9,%r12
- xorq %r8,%r15
-
- rorq $6,%r14
- xorq %rcx,%r13
- addq %r15,%r12
-
- movq %r10,%r15
- addq (%rbp),%r12
- xorq %r10,%r14
-
- xorq %r11,%r15
- rorq $14,%r13
- movq %r11,%r9
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r9
- addq %r12,%rbx
- addq %r12,%r9
-
- leaq 8(%rbp),%rbp
- addq %r14,%r9
- movq 24(%rsi),%r12
- movq %rbx,%r13
- movq %r9,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rcx,%rdi
-
- xorq %rbx,%r13
- rorq $5,%r14
- xorq %rdx,%rdi
-
- movq %r12,24(%rsp)
- xorq %r9,%r14
- andq %rbx,%rdi
-
- rorq $4,%r13
- addq %r8,%r12
- xorq %rdx,%rdi
-
- rorq $6,%r14
- xorq %rbx,%r13
- addq %rdi,%r12
-
- movq %r9,%rdi
- addq (%rbp),%r12
- xorq %r9,%r14
-
- xorq %r10,%rdi
- rorq $14,%r13
- movq %r10,%r8
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r8
- addq %r12,%rax
- addq %r12,%r8
-
- leaq 24(%rbp),%rbp
- addq %r14,%r8
- movq 32(%rsi),%r12
- movq %rax,%r13
- movq %r8,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rbx,%r15
-
- xorq %rax,%r13
- rorq $5,%r14
- xorq %rcx,%r15
-
- movq %r12,32(%rsp)
- xorq %r8,%r14
- andq %rax,%r15
-
- rorq $4,%r13
- addq %rdx,%r12
- xorq %rcx,%r15
-
- rorq $6,%r14
- xorq %rax,%r13
- addq %r15,%r12
-
- movq %r8,%r15
- addq (%rbp),%r12
- xorq %r8,%r14
-
- xorq %r9,%r15
- rorq $14,%r13
- movq %r9,%rdx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rdx
- addq %r12,%r11
- addq %r12,%rdx
-
- leaq 8(%rbp),%rbp
- addq %r14,%rdx
- movq 40(%rsi),%r12
- movq %r11,%r13
- movq %rdx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rax,%rdi
-
- xorq %r11,%r13
- rorq $5,%r14
- xorq %rbx,%rdi
-
- movq %r12,40(%rsp)
- xorq %rdx,%r14
- andq %r11,%rdi
-
- rorq $4,%r13
- addq %rcx,%r12
- xorq %rbx,%rdi
-
- rorq $6,%r14
- xorq %r11,%r13
- addq %rdi,%r12
-
- movq %rdx,%rdi
- addq (%rbp),%r12
- xorq %rdx,%r14
-
- xorq %r8,%rdi
- rorq $14,%r13
- movq %r8,%rcx
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rcx
- addq %r12,%r10
- addq %r12,%rcx
-
- leaq 24(%rbp),%rbp
- addq %r14,%rcx
- movq 48(%rsi),%r12
- movq %r10,%r13
- movq %rcx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r11,%r15
-
- xorq %r10,%r13
- rorq $5,%r14
- xorq %rax,%r15
-
- movq %r12,48(%rsp)
- xorq %rcx,%r14
- andq %r10,%r15
-
- rorq $4,%r13
- addq %rbx,%r12
- xorq %rax,%r15
-
- rorq $6,%r14
- xorq %r10,%r13
- addq %r15,%r12
-
- movq %rcx,%r15
- addq (%rbp),%r12
- xorq %rcx,%r14
-
- xorq %rdx,%r15
- rorq $14,%r13
- movq %rdx,%rbx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rbx
- addq %r12,%r9
- addq %r12,%rbx
-
- leaq 8(%rbp),%rbp
- addq %r14,%rbx
- movq 56(%rsi),%r12
- movq %r9,%r13
- movq %rbx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r10,%rdi
-
- xorq %r9,%r13
- rorq $5,%r14
- xorq %r11,%rdi
-
- movq %r12,56(%rsp)
- xorq %rbx,%r14
- andq %r9,%rdi
-
- rorq $4,%r13
- addq %rax,%r12
- xorq %r11,%rdi
-
- rorq $6,%r14
- xorq %r9,%r13
- addq %rdi,%r12
-
- movq %rbx,%rdi
- addq (%rbp),%r12
- xorq %rbx,%r14
-
- xorq %rcx,%rdi
- rorq $14,%r13
- movq %rcx,%rax
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rax
- addq %r12,%r8
- addq %r12,%rax
-
- leaq 24(%rbp),%rbp
- addq %r14,%rax
- movq 64(%rsi),%r12
- movq %r8,%r13
- movq %rax,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r9,%r15
-
- xorq %r8,%r13
- rorq $5,%r14
- xorq %r10,%r15
-
- movq %r12,64(%rsp)
- xorq %rax,%r14
- andq %r8,%r15
-
- rorq $4,%r13
- addq %r11,%r12
- xorq %r10,%r15
-
- rorq $6,%r14
- xorq %r8,%r13
- addq %r15,%r12
-
- movq %rax,%r15
- addq (%rbp),%r12
- xorq %rax,%r14
-
- xorq %rbx,%r15
- rorq $14,%r13
- movq %rbx,%r11
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r11
- addq %r12,%rdx
- addq %r12,%r11
-
- leaq 8(%rbp),%rbp
- addq %r14,%r11
- movq 72(%rsi),%r12
- movq %rdx,%r13
- movq %r11,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r8,%rdi
-
- xorq %rdx,%r13
- rorq $5,%r14
- xorq %r9,%rdi
-
- movq %r12,72(%rsp)
- xorq %r11,%r14
- andq %rdx,%rdi
-
- rorq $4,%r13
- addq %r10,%r12
- xorq %r9,%rdi
-
- rorq $6,%r14
- xorq %rdx,%r13
- addq %rdi,%r12
-
- movq %r11,%rdi
- addq (%rbp),%r12
- xorq %r11,%r14
-
- xorq %rax,%rdi
- rorq $14,%r13
- movq %rax,%r10
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r10
- addq %r12,%rcx
- addq %r12,%r10
-
- leaq 24(%rbp),%rbp
- addq %r14,%r10
- movq 80(%rsi),%r12
- movq %rcx,%r13
- movq %r10,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rdx,%r15
-
- xorq %rcx,%r13
- rorq $5,%r14
- xorq %r8,%r15
-
- movq %r12,80(%rsp)
- xorq %r10,%r14
- andq %rcx,%r15
-
- rorq $4,%r13
- addq %r9,%r12
- xorq %r8,%r15
-
- rorq $6,%r14
- xorq %rcx,%r13
- addq %r15,%r12
-
- movq %r10,%r15
- addq (%rbp),%r12
- xorq %r10,%r14
-
- xorq %r11,%r15
- rorq $14,%r13
- movq %r11,%r9
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r9
- addq %r12,%rbx
- addq %r12,%r9
-
- leaq 8(%rbp),%rbp
- addq %r14,%r9
- movq 88(%rsi),%r12
- movq %rbx,%r13
- movq %r9,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rcx,%rdi
-
- xorq %rbx,%r13
- rorq $5,%r14
- xorq %rdx,%rdi
-
- movq %r12,88(%rsp)
- xorq %r9,%r14
- andq %rbx,%rdi
-
- rorq $4,%r13
- addq %r8,%r12
- xorq %rdx,%rdi
-
- rorq $6,%r14
- xorq %rbx,%r13
- addq %rdi,%r12
-
- movq %r9,%rdi
- addq (%rbp),%r12
- xorq %r9,%r14
-
- xorq %r10,%rdi
- rorq $14,%r13
- movq %r10,%r8
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r8
- addq %r12,%rax
- addq %r12,%r8
-
- leaq 24(%rbp),%rbp
- addq %r14,%r8
- movq 96(%rsi),%r12
- movq %rax,%r13
- movq %r8,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rbx,%r15
-
- xorq %rax,%r13
- rorq $5,%r14
- xorq %rcx,%r15
-
- movq %r12,96(%rsp)
- xorq %r8,%r14
- andq %rax,%r15
-
- rorq $4,%r13
- addq %rdx,%r12
- xorq %rcx,%r15
-
- rorq $6,%r14
- xorq %rax,%r13
- addq %r15,%r12
-
- movq %r8,%r15
- addq (%rbp),%r12
- xorq %r8,%r14
-
- xorq %r9,%r15
- rorq $14,%r13
- movq %r9,%rdx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rdx
- addq %r12,%r11
- addq %r12,%rdx
-
- leaq 8(%rbp),%rbp
- addq %r14,%rdx
- movq 104(%rsi),%r12
- movq %r11,%r13
- movq %rdx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %rax,%rdi
-
- xorq %r11,%r13
- rorq $5,%r14
- xorq %rbx,%rdi
-
- movq %r12,104(%rsp)
- xorq %rdx,%r14
- andq %r11,%rdi
-
- rorq $4,%r13
- addq %rcx,%r12
- xorq %rbx,%rdi
-
- rorq $6,%r14
- xorq %r11,%r13
- addq %rdi,%r12
-
- movq %rdx,%rdi
- addq (%rbp),%r12
- xorq %rdx,%r14
-
- xorq %r8,%rdi
- rorq $14,%r13
- movq %r8,%rcx
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rcx
- addq %r12,%r10
- addq %r12,%rcx
-
- leaq 24(%rbp),%rbp
- addq %r14,%rcx
- movq 112(%rsi),%r12
- movq %r10,%r13
- movq %rcx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r11,%r15
-
- xorq %r10,%r13
- rorq $5,%r14
- xorq %rax,%r15
-
- movq %r12,112(%rsp)
- xorq %rcx,%r14
- andq %r10,%r15
-
- rorq $4,%r13
- addq %rbx,%r12
- xorq %rax,%r15
-
- rorq $6,%r14
- xorq %r10,%r13
- addq %r15,%r12
-
- movq %rcx,%r15
- addq (%rbp),%r12
- xorq %rcx,%r14
-
- xorq %rdx,%r15
- rorq $14,%r13
- movq %rdx,%rbx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rbx
- addq %r12,%r9
- addq %r12,%rbx
-
- leaq 8(%rbp),%rbp
- addq %r14,%rbx
- movq 120(%rsi),%r12
- movq %r9,%r13
- movq %rbx,%r14
- bswapq %r12
- rorq $23,%r13
- movq %r10,%rdi
-
- xorq %r9,%r13
- rorq $5,%r14
- xorq %r11,%rdi
-
- movq %r12,120(%rsp)
- xorq %rbx,%r14
- andq %r9,%rdi
-
- rorq $4,%r13
- addq %rax,%r12
- xorq %r11,%rdi
-
- rorq $6,%r14
- xorq %r9,%r13
- addq %rdi,%r12
-
- movq %rbx,%rdi
- addq (%rbp),%r12
- xorq %rbx,%r14
-
- xorq %rcx,%rdi
- rorq $14,%r13
- movq %rcx,%rax
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rax
- addq %r12,%r8
- addq %r12,%rax
-
- leaq 24(%rbp),%rbp
- jmp L$rounds_16_xx
-.p2align 4
-L$rounds_16_xx:
- movq 8(%rsp),%r13
- movq 112(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rax
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 72(%rsp),%r12
-
- addq 0(%rsp),%r12
- movq %r8,%r13
- addq %r15,%r12
- movq %rax,%r14
- rorq $23,%r13
- movq %r9,%r15
-
- xorq %r8,%r13
- rorq $5,%r14
- xorq %r10,%r15
-
- movq %r12,0(%rsp)
- xorq %rax,%r14
- andq %r8,%r15
-
- rorq $4,%r13
- addq %r11,%r12
- xorq %r10,%r15
-
- rorq $6,%r14
- xorq %r8,%r13
- addq %r15,%r12
-
- movq %rax,%r15
- addq (%rbp),%r12
- xorq %rax,%r14
-
- xorq %rbx,%r15
- rorq $14,%r13
- movq %rbx,%r11
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r11
- addq %r12,%rdx
- addq %r12,%r11
-
- leaq 8(%rbp),%rbp
- movq 16(%rsp),%r13
- movq 120(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r11
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 80(%rsp),%r12
-
- addq 8(%rsp),%r12
- movq %rdx,%r13
- addq %rdi,%r12
- movq %r11,%r14
- rorq $23,%r13
- movq %r8,%rdi
-
- xorq %rdx,%r13
- rorq $5,%r14
- xorq %r9,%rdi
-
- movq %r12,8(%rsp)
- xorq %r11,%r14
- andq %rdx,%rdi
-
- rorq $4,%r13
- addq %r10,%r12
- xorq %r9,%rdi
-
- rorq $6,%r14
- xorq %rdx,%r13
- addq %rdi,%r12
-
- movq %r11,%rdi
- addq (%rbp),%r12
- xorq %r11,%r14
-
- xorq %rax,%rdi
- rorq $14,%r13
- movq %rax,%r10
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r10
- addq %r12,%rcx
- addq %r12,%r10
-
- leaq 24(%rbp),%rbp
- movq 24(%rsp),%r13
- movq 0(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r10
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 88(%rsp),%r12
-
- addq 16(%rsp),%r12
- movq %rcx,%r13
- addq %r15,%r12
- movq %r10,%r14
- rorq $23,%r13
- movq %rdx,%r15
-
- xorq %rcx,%r13
- rorq $5,%r14
- xorq %r8,%r15
-
- movq %r12,16(%rsp)
- xorq %r10,%r14
- andq %rcx,%r15
-
- rorq $4,%r13
- addq %r9,%r12
- xorq %r8,%r15
-
- rorq $6,%r14
- xorq %rcx,%r13
- addq %r15,%r12
-
- movq %r10,%r15
- addq (%rbp),%r12
- xorq %r10,%r14
-
- xorq %r11,%r15
- rorq $14,%r13
- movq %r11,%r9
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r9
- addq %r12,%rbx
- addq %r12,%r9
-
- leaq 8(%rbp),%rbp
- movq 32(%rsp),%r13
- movq 8(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r9
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 96(%rsp),%r12
-
- addq 24(%rsp),%r12
- movq %rbx,%r13
- addq %rdi,%r12
- movq %r9,%r14
- rorq $23,%r13
- movq %rcx,%rdi
-
- xorq %rbx,%r13
- rorq $5,%r14
- xorq %rdx,%rdi
-
- movq %r12,24(%rsp)
- xorq %r9,%r14
- andq %rbx,%rdi
-
- rorq $4,%r13
- addq %r8,%r12
- xorq %rdx,%rdi
-
- rorq $6,%r14
- xorq %rbx,%r13
- addq %rdi,%r12
-
- movq %r9,%rdi
- addq (%rbp),%r12
- xorq %r9,%r14
-
- xorq %r10,%rdi
- rorq $14,%r13
- movq %r10,%r8
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r8
- addq %r12,%rax
- addq %r12,%r8
-
- leaq 24(%rbp),%rbp
- movq 40(%rsp),%r13
- movq 16(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r8
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 104(%rsp),%r12
-
- addq 32(%rsp),%r12
- movq %rax,%r13
- addq %r15,%r12
- movq %r8,%r14
- rorq $23,%r13
- movq %rbx,%r15
-
- xorq %rax,%r13
- rorq $5,%r14
- xorq %rcx,%r15
-
- movq %r12,32(%rsp)
- xorq %r8,%r14
- andq %rax,%r15
-
- rorq $4,%r13
- addq %rdx,%r12
- xorq %rcx,%r15
-
- rorq $6,%r14
- xorq %rax,%r13
- addq %r15,%r12
-
- movq %r8,%r15
- addq (%rbp),%r12
- xorq %r8,%r14
-
- xorq %r9,%r15
- rorq $14,%r13
- movq %r9,%rdx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rdx
- addq %r12,%r11
- addq %r12,%rdx
-
- leaq 8(%rbp),%rbp
- movq 48(%rsp),%r13
- movq 24(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rdx
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 112(%rsp),%r12
-
- addq 40(%rsp),%r12
- movq %r11,%r13
- addq %rdi,%r12
- movq %rdx,%r14
- rorq $23,%r13
- movq %rax,%rdi
-
- xorq %r11,%r13
- rorq $5,%r14
- xorq %rbx,%rdi
-
- movq %r12,40(%rsp)
- xorq %rdx,%r14
- andq %r11,%rdi
-
- rorq $4,%r13
- addq %rcx,%r12
- xorq %rbx,%rdi
-
- rorq $6,%r14
- xorq %r11,%r13
- addq %rdi,%r12
-
- movq %rdx,%rdi
- addq (%rbp),%r12
- xorq %rdx,%r14
-
- xorq %r8,%rdi
- rorq $14,%r13
- movq %r8,%rcx
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rcx
- addq %r12,%r10
- addq %r12,%rcx
-
- leaq 24(%rbp),%rbp
- movq 56(%rsp),%r13
- movq 32(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rcx
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 120(%rsp),%r12
-
- addq 48(%rsp),%r12
- movq %r10,%r13
- addq %r15,%r12
- movq %rcx,%r14
- rorq $23,%r13
- movq %r11,%r15
-
- xorq %r10,%r13
- rorq $5,%r14
- xorq %rax,%r15
-
- movq %r12,48(%rsp)
- xorq %rcx,%r14
- andq %r10,%r15
-
- rorq $4,%r13
- addq %rbx,%r12
- xorq %rax,%r15
-
- rorq $6,%r14
- xorq %r10,%r13
- addq %r15,%r12
-
- movq %rcx,%r15
- addq (%rbp),%r12
- xorq %rcx,%r14
-
- xorq %rdx,%r15
- rorq $14,%r13
- movq %rdx,%rbx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rbx
- addq %r12,%r9
- addq %r12,%rbx
-
- leaq 8(%rbp),%rbp
- movq 64(%rsp),%r13
- movq 40(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rbx
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 0(%rsp),%r12
-
- addq 56(%rsp),%r12
- movq %r9,%r13
- addq %rdi,%r12
- movq %rbx,%r14
- rorq $23,%r13
- movq %r10,%rdi
-
- xorq %r9,%r13
- rorq $5,%r14
- xorq %r11,%rdi
-
- movq %r12,56(%rsp)
- xorq %rbx,%r14
- andq %r9,%rdi
-
- rorq $4,%r13
- addq %rax,%r12
- xorq %r11,%rdi
-
- rorq $6,%r14
- xorq %r9,%r13
- addq %rdi,%r12
-
- movq %rbx,%rdi
- addq (%rbp),%r12
- xorq %rbx,%r14
-
- xorq %rcx,%rdi
- rorq $14,%r13
- movq %rcx,%rax
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rax
- addq %r12,%r8
- addq %r12,%rax
-
- leaq 24(%rbp),%rbp
- movq 72(%rsp),%r13
- movq 48(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rax
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 8(%rsp),%r12
-
- addq 64(%rsp),%r12
- movq %r8,%r13
- addq %r15,%r12
- movq %rax,%r14
- rorq $23,%r13
- movq %r9,%r15
-
- xorq %r8,%r13
- rorq $5,%r14
- xorq %r10,%r15
-
- movq %r12,64(%rsp)
- xorq %rax,%r14
- andq %r8,%r15
-
- rorq $4,%r13
- addq %r11,%r12
- xorq %r10,%r15
-
- rorq $6,%r14
- xorq %r8,%r13
- addq %r15,%r12
-
- movq %rax,%r15
- addq (%rbp),%r12
- xorq %rax,%r14
-
- xorq %rbx,%r15
- rorq $14,%r13
- movq %rbx,%r11
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r11
- addq %r12,%rdx
- addq %r12,%r11
-
- leaq 8(%rbp),%rbp
- movq 80(%rsp),%r13
- movq 56(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r11
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 16(%rsp),%r12
-
- addq 72(%rsp),%r12
- movq %rdx,%r13
- addq %rdi,%r12
- movq %r11,%r14
- rorq $23,%r13
- movq %r8,%rdi
-
- xorq %rdx,%r13
- rorq $5,%r14
- xorq %r9,%rdi
-
- movq %r12,72(%rsp)
- xorq %r11,%r14
- andq %rdx,%rdi
-
- rorq $4,%r13
- addq %r10,%r12
- xorq %r9,%rdi
-
- rorq $6,%r14
- xorq %rdx,%r13
- addq %rdi,%r12
-
- movq %r11,%rdi
- addq (%rbp),%r12
- xorq %r11,%r14
-
- xorq %rax,%rdi
- rorq $14,%r13
- movq %rax,%r10
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r10
- addq %r12,%rcx
- addq %r12,%r10
-
- leaq 24(%rbp),%rbp
- movq 88(%rsp),%r13
- movq 64(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r10
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 24(%rsp),%r12
-
- addq 80(%rsp),%r12
- movq %rcx,%r13
- addq %r15,%r12
- movq %r10,%r14
- rorq $23,%r13
- movq %rdx,%r15
-
- xorq %rcx,%r13
- rorq $5,%r14
- xorq %r8,%r15
-
- movq %r12,80(%rsp)
- xorq %r10,%r14
- andq %rcx,%r15
-
- rorq $4,%r13
- addq %r9,%r12
- xorq %r8,%r15
-
- rorq $6,%r14
- xorq %rcx,%r13
- addq %r15,%r12
-
- movq %r10,%r15
- addq (%rbp),%r12
- xorq %r10,%r14
-
- xorq %r11,%r15
- rorq $14,%r13
- movq %r11,%r9
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%r9
- addq %r12,%rbx
- addq %r12,%r9
-
- leaq 8(%rbp),%rbp
- movq 96(%rsp),%r13
- movq 72(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r9
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 32(%rsp),%r12
-
- addq 88(%rsp),%r12
- movq %rbx,%r13
- addq %rdi,%r12
- movq %r9,%r14
- rorq $23,%r13
- movq %rcx,%rdi
-
- xorq %rbx,%r13
- rorq $5,%r14
- xorq %rdx,%rdi
-
- movq %r12,88(%rsp)
- xorq %r9,%r14
- andq %rbx,%rdi
-
- rorq $4,%r13
- addq %r8,%r12
- xorq %rdx,%rdi
-
- rorq $6,%r14
- xorq %rbx,%r13
- addq %rdi,%r12
-
- movq %r9,%rdi
- addq (%rbp),%r12
- xorq %r9,%r14
-
- xorq %r10,%rdi
- rorq $14,%r13
- movq %r10,%r8
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%r8
- addq %r12,%rax
- addq %r12,%r8
-
- leaq 24(%rbp),%rbp
- movq 104(%rsp),%r13
- movq 80(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%r8
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 40(%rsp),%r12
-
- addq 96(%rsp),%r12
- movq %rax,%r13
- addq %r15,%r12
- movq %r8,%r14
- rorq $23,%r13
- movq %rbx,%r15
-
- xorq %rax,%r13
- rorq $5,%r14
- xorq %rcx,%r15
-
- movq %r12,96(%rsp)
- xorq %r8,%r14
- andq %rax,%r15
-
- rorq $4,%r13
- addq %rdx,%r12
- xorq %rcx,%r15
-
- rorq $6,%r14
- xorq %rax,%r13
- addq %r15,%r12
-
- movq %r8,%r15
- addq (%rbp),%r12
- xorq %r8,%r14
-
- xorq %r9,%r15
- rorq $14,%r13
- movq %r9,%rdx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rdx
- addq %r12,%r11
- addq %r12,%rdx
-
- leaq 8(%rbp),%rbp
- movq 112(%rsp),%r13
- movq 88(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rdx
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 48(%rsp),%r12
-
- addq 104(%rsp),%r12
- movq %r11,%r13
- addq %rdi,%r12
- movq %rdx,%r14
- rorq $23,%r13
- movq %rax,%rdi
-
- xorq %r11,%r13
- rorq $5,%r14
- xorq %rbx,%rdi
-
- movq %r12,104(%rsp)
- xorq %rdx,%r14
- andq %r11,%rdi
-
- rorq $4,%r13
- addq %rcx,%r12
- xorq %rbx,%rdi
-
- rorq $6,%r14
- xorq %r11,%r13
- addq %rdi,%r12
-
- movq %rdx,%rdi
- addq (%rbp),%r12
- xorq %rdx,%r14
-
- xorq %r8,%rdi
- rorq $14,%r13
- movq %r8,%rcx
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rcx
- addq %r12,%r10
- addq %r12,%rcx
-
- leaq 24(%rbp),%rbp
- movq 120(%rsp),%r13
- movq 96(%rsp),%r15
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rcx
- movq %r15,%r14
- rorq $42,%r15
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%r15
- shrq $6,%r14
-
- rorq $19,%r15
- xorq %r13,%r12
- xorq %r14,%r15
- addq 56(%rsp),%r12
-
- addq 112(%rsp),%r12
- movq %r10,%r13
- addq %r15,%r12
- movq %rcx,%r14
- rorq $23,%r13
- movq %r11,%r15
-
- xorq %r10,%r13
- rorq $5,%r14
- xorq %rax,%r15
-
- movq %r12,112(%rsp)
- xorq %rcx,%r14
- andq %r10,%r15
-
- rorq $4,%r13
- addq %rbx,%r12
- xorq %rax,%r15
-
- rorq $6,%r14
- xorq %r10,%r13
- addq %r15,%r12
-
- movq %rcx,%r15
- addq (%rbp),%r12
- xorq %rcx,%r14
-
- xorq %rdx,%r15
- rorq $14,%r13
- movq %rdx,%rbx
-
- andq %r15,%rdi
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %rdi,%rbx
- addq %r12,%r9
- addq %r12,%rbx
-
- leaq 8(%rbp),%rbp
- movq 0(%rsp),%r13
- movq 104(%rsp),%rdi
-
- movq %r13,%r12
- rorq $7,%r13
- addq %r14,%rbx
- movq %rdi,%r14
- rorq $42,%rdi
-
- xorq %r12,%r13
- shrq $7,%r12
- rorq $1,%r13
- xorq %r14,%rdi
- shrq $6,%r14
-
- rorq $19,%rdi
- xorq %r13,%r12
- xorq %r14,%rdi
- addq 64(%rsp),%r12
-
- addq 120(%rsp),%r12
- movq %r9,%r13
- addq %rdi,%r12
- movq %rbx,%r14
- rorq $23,%r13
- movq %r10,%rdi
-
- xorq %r9,%r13
- rorq $5,%r14
- xorq %r11,%rdi
-
- movq %r12,120(%rsp)
- xorq %rbx,%r14
- andq %r9,%rdi
-
- rorq $4,%r13
- addq %rax,%r12
- xorq %r11,%rdi
-
- rorq $6,%r14
- xorq %r9,%r13
- addq %rdi,%r12
-
- movq %rbx,%rdi
- addq (%rbp),%r12
- xorq %rbx,%r14
-
- xorq %rcx,%rdi
- rorq $14,%r13
- movq %rcx,%rax
-
- andq %rdi,%r15
- rorq $28,%r14
- addq %r13,%r12
-
- xorq %r15,%rax
- addq %r12,%r8
- addq %r12,%rax
-
- leaq 24(%rbp),%rbp
- cmpb $0,7(%rbp)
- jnz L$rounds_16_xx
-
- movq 128+0(%rsp),%rdi
- addq %r14,%rax
- leaq 128(%rsi),%rsi
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb L$loop
-
- movq 152(%rsp),%rsi
-
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue:
- ret
-
-
-.section __DATA,__const
-.p2align 6
-
-K512:
-.quad 0x428a2f98d728ae22,0x7137449123ef65cd
-.quad 0x428a2f98d728ae22,0x7137449123ef65cd
-.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad 0x3956c25bf348b538,0x59f111f1b605d019
-.quad 0x3956c25bf348b538,0x59f111f1b605d019
-.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad 0xd807aa98a3030242,0x12835b0145706fbe
-.quad 0xd807aa98a3030242,0x12835b0145706fbe
-.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad 0x9bdc06a725c71235,0xc19bf174cf692694
-.quad 0x9bdc06a725c71235,0xc19bf174cf692694
-.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad 0x983e5152ee66dfab,0xa831c66d2db43210
-.quad 0x983e5152ee66dfab,0xa831c66d2db43210
-.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad 0x06ca6351e003826f,0x142929670a0e6e70
-.quad 0x06ca6351e003826f,0x142929670a0e6e70
-.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad 0x81c2c92e47edaee6,0x92722c851482353b
-.quad 0x81c2c92e47edaee6,0x92722c851482353b
-.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad 0xd192e819d6ef5218,0xd69906245565a910
-.quad 0xd192e819d6ef5218,0xd69906245565a910
-.quad 0xf40e35855771202a,0x106aa07032bbd1b8
-.quad 0xf40e35855771202a,0x106aa07032bbd1b8
-.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad 0x90befffa23631e28,0xa4506cebde82bde9
-.quad 0x90befffa23631e28,0xa4506cebde82bde9
-.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad 0xca273eceea26619c,0xd186b8c721c0c207
-.quad 0xca273eceea26619c,0xd186b8c721c0c207
-.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad 0x113f9804bef90dae,0x1b710b35131c471b
-.quad 0x113f9804bef90dae,0x1b710b35131c471b
-.quad 0x28db77f523047d84,0x32caab7b40c72493
-.quad 0x28db77f523047d84,0x32caab7b40c72493
-.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.quad 0x0001020304050607,0x08090a0b0c0d0e0f
-.quad 0x0001020304050607,0x08090a0b0c0d0e0f
-.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text
-
-.p2align 6
-sha512_block_data_order_avx:
-
-L$avx_shortcut:
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-
-L$prologue_avx:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$loop_avx
-.p2align 4
-L$loop_avx:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp L$avx_00_47
-
-.p2align 4
-L$avx_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm0,%xmm0
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 0(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm7,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm7,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm7,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 8(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm0,%xmm0
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm1,%xmm1
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 16(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm0,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm0,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm0,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 24(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm1,%xmm1
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm2,%xmm2
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 32(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm1,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm1,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm1,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm2,%xmm2
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm3,%xmm3
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm2,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm2,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm2,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm3,%xmm3
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm4,%xmm4
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 64(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm3,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm3,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm3,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 72(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm4,%xmm4
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm5,%xmm5
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 80(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm4,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm4,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm4,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 88(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm5,%xmm5
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm6,%xmm6
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 96(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm5,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm5,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm5,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm6,%xmm6
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm7,%xmm7
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm6,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm6,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm6,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm7,%xmm7
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne L$avx_00_47
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb L$loop_avx
-
- movq 152(%rsp),%rsi
-
- vzeroupper
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue_avx:
- ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S
deleted file mode 100644
index 5aea40f..0000000
--- a/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S
+++ /dev/null
@@ -1,1131 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_encrypt_core:
-
- movq %rdx,%r9
- movq $16,%r11
- movl 240(%rdx),%eax
- movdqa %xmm9,%xmm1
- movdqa L$k_ipt(%rip),%xmm2
- pandn %xmm0,%xmm1
- movdqu (%r9),%xmm5
- psrld $4,%xmm1
- pand %xmm9,%xmm0
-.byte 102,15,56,0,208
- movdqa L$k_ipt+16(%rip),%xmm0
-.byte 102,15,56,0,193
- pxor %xmm5,%xmm2
- addq $16,%r9
- pxor %xmm2,%xmm0
- leaq L$k_mc_backward(%rip),%r10
- jmp L$enc_entry
-
-.p2align 4
-L$enc_loop:
-
- movdqa %xmm13,%xmm4
- movdqa %xmm12,%xmm0
-.byte 102,15,56,0,226
-.byte 102,15,56,0,195
- pxor %xmm5,%xmm4
- movdqa %xmm15,%xmm5
- pxor %xmm4,%xmm0
- movdqa -64(%r11,%r10,1),%xmm1
-.byte 102,15,56,0,234
- movdqa (%r11,%r10,1),%xmm4
- movdqa %xmm14,%xmm2
-.byte 102,15,56,0,211
- movdqa %xmm0,%xmm3
- pxor %xmm5,%xmm2
-.byte 102,15,56,0,193
- addq $16,%r9
- pxor %xmm2,%xmm0
-.byte 102,15,56,0,220
- addq $16,%r11
- pxor %xmm0,%xmm3
-.byte 102,15,56,0,193
- andq $0x30,%r11
- subq $1,%rax
- pxor %xmm3,%xmm0
-
-L$enc_entry:
-
- movdqa %xmm9,%xmm1
- movdqa %xmm11,%xmm5
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm9,%xmm0
-.byte 102,15,56,0,232
- movdqa %xmm10,%xmm3
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
- movdqa %xmm10,%xmm4
- pxor %xmm5,%xmm3
-.byte 102,15,56,0,224
- movdqa %xmm10,%xmm2
- pxor %xmm5,%xmm4
-.byte 102,15,56,0,211
- movdqa %xmm10,%xmm3
- pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
- movdqu (%r9),%xmm5
- pxor %xmm1,%xmm3
- jnz L$enc_loop
-
-
- movdqa -96(%r10),%xmm4
- movdqa -80(%r10),%xmm0
-.byte 102,15,56,0,226
- pxor %xmm5,%xmm4
-.byte 102,15,56,0,195
- movdqa 64(%r11,%r10,1),%xmm1
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,193
- ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_encrypt_core_2x:
-
- movq %rdx,%r9
- movq $16,%r11
- movl 240(%rdx),%eax
- movdqa %xmm9,%xmm1
- movdqa %xmm9,%xmm7
- movdqa L$k_ipt(%rip),%xmm2
- movdqa %xmm2,%xmm8
- pandn %xmm0,%xmm1
- pandn %xmm6,%xmm7
- movdqu (%r9),%xmm5
-
- psrld $4,%xmm1
- psrld $4,%xmm7
- pand %xmm9,%xmm0
- pand %xmm9,%xmm6
-.byte 102,15,56,0,208
-.byte 102,68,15,56,0,198
- movdqa L$k_ipt+16(%rip),%xmm0
- movdqa %xmm0,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,247
- pxor %xmm5,%xmm2
- pxor %xmm5,%xmm8
- addq $16,%r9
- pxor %xmm2,%xmm0
- pxor %xmm8,%xmm6
- leaq L$k_mc_backward(%rip),%r10
- jmp L$enc2x_entry
-
-.p2align 4
-L$enc2x_loop:
-
- movdqa L$k_sb1(%rip),%xmm4
- movdqa L$k_sb1+16(%rip),%xmm0
- movdqa %xmm4,%xmm12
- movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
- pxor %xmm5,%xmm4
- pxor %xmm5,%xmm12
- movdqa L$k_sb2(%rip),%xmm5
- movdqa %xmm5,%xmm13
- pxor %xmm4,%xmm0
- pxor %xmm12,%xmm6
- movdqa -64(%r11,%r10,1),%xmm1
-
-.byte 102,15,56,0,234
-.byte 102,69,15,56,0,232
- movdqa (%r11,%r10,1),%xmm4
-
- movdqa L$k_sb2+16(%rip),%xmm2
- movdqa %xmm2,%xmm8
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
- movdqa %xmm0,%xmm3
- movdqa %xmm6,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm13,%xmm8
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
- addq $16,%r9
- pxor %xmm2,%xmm0
- pxor %xmm8,%xmm6
-.byte 102,15,56,0,220
-.byte 102,68,15,56,0,220
- addq $16,%r11
- pxor %xmm0,%xmm3
- pxor %xmm6,%xmm11
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
- andq $0x30,%r11
- subq $1,%rax
- pxor %xmm3,%xmm0
- pxor %xmm11,%xmm6
-
-L$enc2x_entry:
-
- movdqa %xmm9,%xmm1
- movdqa %xmm9,%xmm7
- movdqa L$k_inv+16(%rip),%xmm5
- movdqa %xmm5,%xmm13
- pandn %xmm0,%xmm1
- pandn %xmm6,%xmm7
- psrld $4,%xmm1
- psrld $4,%xmm7
- pand %xmm9,%xmm0
- pand %xmm9,%xmm6
-.byte 102,15,56,0,232
-.byte 102,68,15,56,0,238
- movdqa %xmm10,%xmm3
- movdqa %xmm10,%xmm11
- pxor %xmm1,%xmm0
- pxor %xmm7,%xmm6
-.byte 102,15,56,0,217
-.byte 102,68,15,56,0,223
- movdqa %xmm10,%xmm4
- movdqa %xmm10,%xmm12
- pxor %xmm5,%xmm3
- pxor %xmm13,%xmm11
-.byte 102,15,56,0,224
-.byte 102,68,15,56,0,230
- movdqa %xmm10,%xmm2
- movdqa %xmm10,%xmm8
- pxor %xmm5,%xmm4
- pxor %xmm13,%xmm12
-.byte 102,15,56,0,211
-.byte 102,69,15,56,0,195
- movdqa %xmm10,%xmm3
- movdqa %xmm10,%xmm11
- pxor %xmm0,%xmm2
- pxor %xmm6,%xmm8
-.byte 102,15,56,0,220
-.byte 102,69,15,56,0,220
- movdqu (%r9),%xmm5
-
- pxor %xmm1,%xmm3
- pxor %xmm7,%xmm11
- jnz L$enc2x_loop
-
-
- movdqa -96(%r10),%xmm4
- movdqa -80(%r10),%xmm0
- movdqa %xmm4,%xmm12
- movdqa %xmm0,%xmm6
-.byte 102,15,56,0,226
-.byte 102,69,15,56,0,224
- pxor %xmm5,%xmm4
- pxor %xmm5,%xmm12
-.byte 102,15,56,0,195
-.byte 102,65,15,56,0,243
- movdqa 64(%r11,%r10,1),%xmm1
-
- pxor %xmm4,%xmm0
- pxor %xmm12,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
- ret
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_decrypt_core:
-
- movq %rdx,%r9
- movl 240(%rdx),%eax
- movdqa %xmm9,%xmm1
- movdqa L$k_dipt(%rip),%xmm2
- pandn %xmm0,%xmm1
- movq %rax,%r11
- psrld $4,%xmm1
- movdqu (%r9),%xmm5
- shlq $4,%r11
- pand %xmm9,%xmm0
-.byte 102,15,56,0,208
- movdqa L$k_dipt+16(%rip),%xmm0
- xorq $0x30,%r11
- leaq L$k_dsbd(%rip),%r10
-.byte 102,15,56,0,193
- andq $0x30,%r11
- pxor %xmm5,%xmm2
- movdqa L$k_mc_forward+48(%rip),%xmm5
- pxor %xmm2,%xmm0
- addq $16,%r9
- addq %r10,%r11
- jmp L$dec_entry
-
-.p2align 4
-L$dec_loop:
-
-
-
- movdqa -32(%r10),%xmm4
- movdqa -16(%r10),%xmm1
-.byte 102,15,56,0,226
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa 0(%r10),%xmm4
- pxor %xmm1,%xmm0
- movdqa 16(%r10),%xmm1
-
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa 32(%r10),%xmm4
- pxor %xmm1,%xmm0
- movdqa 48(%r10),%xmm1
-
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- movdqa 64(%r10),%xmm4
- pxor %xmm1,%xmm0
- movdqa 80(%r10),%xmm1
-
-.byte 102,15,56,0,226
-.byte 102,15,56,0,197
-.byte 102,15,56,0,203
- pxor %xmm4,%xmm0
- addq $16,%r9
-.byte 102,15,58,15,237,12
- pxor %xmm1,%xmm0
- subq $1,%rax
-
-L$dec_entry:
-
- movdqa %xmm9,%xmm1
- pandn %xmm0,%xmm1
- movdqa %xmm11,%xmm2
- psrld $4,%xmm1
- pand %xmm9,%xmm0
-.byte 102,15,56,0,208
- movdqa %xmm10,%xmm3
- pxor %xmm1,%xmm0
-.byte 102,15,56,0,217
- movdqa %xmm10,%xmm4
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,224
- pxor %xmm2,%xmm4
- movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
- movdqa %xmm10,%xmm3
- pxor %xmm0,%xmm2
-.byte 102,15,56,0,220
- movdqu (%r9),%xmm0
- pxor %xmm1,%xmm3
- jnz L$dec_loop
-
-
- movdqa 96(%r10),%xmm4
-.byte 102,15,56,0,226
- pxor %xmm0,%xmm4
- movdqa 112(%r10),%xmm0
- movdqa -352(%r11),%xmm2
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
-.byte 102,15,56,0,194
- ret
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_schedule_core:
-
-
-
-
-
-
- call _vpaes_preheat
- movdqa L$k_rcon(%rip),%xmm8
- movdqu (%rdi),%xmm0
-
-
- movdqa %xmm0,%xmm3
- leaq L$k_ipt(%rip),%r11
- call _vpaes_schedule_transform
- movdqa %xmm0,%xmm7
-
- leaq L$k_sr(%rip),%r10
- testq %rcx,%rcx
- jnz L$schedule_am_decrypting
-
-
- movdqu %xmm0,(%rdx)
- jmp L$schedule_go
-
-L$schedule_am_decrypting:
-
- movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
- movdqu %xmm3,(%rdx)
- xorq $0x30,%r8
-
-L$schedule_go:
- cmpl $192,%esi
- ja L$schedule_256
- je L$schedule_192
-
-
-
-
-
-
-
-
-
-
-L$schedule_128:
- movl $10,%esi
-
-L$oop_schedule_128:
- call _vpaes_schedule_round
- decq %rsi
- jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
- jmp L$oop_schedule_128
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-L$schedule_192:
- movdqu 8(%rdi),%xmm0
- call _vpaes_schedule_transform
- movdqa %xmm0,%xmm6
- pxor %xmm4,%xmm4
- movhlps %xmm4,%xmm6
- movl $4,%esi
-
-L$oop_schedule_192:
- call _vpaes_schedule_round
-.byte 102,15,58,15,198,8
- call _vpaes_schedule_mangle
- call _vpaes_schedule_192_smear
- call _vpaes_schedule_mangle
- call _vpaes_schedule_round
- decq %rsi
- jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
- call _vpaes_schedule_192_smear
- jmp L$oop_schedule_192
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-L$schedule_256:
- movdqu 16(%rdi),%xmm0
- call _vpaes_schedule_transform
- movl $7,%esi
-
-L$oop_schedule_256:
- call _vpaes_schedule_mangle
- movdqa %xmm0,%xmm6
-
-
- call _vpaes_schedule_round
- decq %rsi
- jz L$schedule_mangle_last
- call _vpaes_schedule_mangle
-
-
- pshufd $0xFF,%xmm0,%xmm0
- movdqa %xmm7,%xmm5
- movdqa %xmm6,%xmm7
- call _vpaes_schedule_low_round
- movdqa %xmm5,%xmm7
-
- jmp L$oop_schedule_256
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-L$schedule_mangle_last:
-
- leaq L$k_deskew(%rip),%r11
- testq %rcx,%rcx
- jnz L$schedule_mangle_last_dec
-
-
- movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,193
- leaq L$k_opt(%rip),%r11
- addq $32,%rdx
-
-L$schedule_mangle_last_dec:
- addq $-16,%rdx
- pxor L$k_s63(%rip),%xmm0
- call _vpaes_schedule_transform
- movdqu %xmm0,(%rdx)
-
-
- pxor %xmm0,%xmm0
- pxor %xmm1,%xmm1
- pxor %xmm2,%xmm2
- pxor %xmm3,%xmm3
- pxor %xmm4,%xmm4
- pxor %xmm5,%xmm5
- pxor %xmm6,%xmm6
- pxor %xmm7,%xmm7
- ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_schedule_192_smear:
-
- pshufd $0x80,%xmm6,%xmm1
- pshufd $0xFE,%xmm7,%xmm0
- pxor %xmm1,%xmm6
- pxor %xmm1,%xmm1
- pxor %xmm0,%xmm6
- movdqa %xmm6,%xmm0
- movhlps %xmm1,%xmm6
- ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_schedule_round:
-
-
- pxor %xmm1,%xmm1
-.byte 102,65,15,58,15,200,15
-.byte 102,69,15,58,15,192,15
- pxor %xmm1,%xmm7
-
-
- pshufd $0xFF,%xmm0,%xmm0
-.byte 102,15,58,15,192,1
-
-
-
-
-_vpaes_schedule_low_round:
-
- movdqa %xmm7,%xmm1
- pslldq $4,%xmm7
- pxor %xmm1,%xmm7
- movdqa %xmm7,%xmm1
- pslldq $8,%xmm7
- pxor %xmm1,%xmm7
- pxor L$k_s63(%rip),%xmm7
-
-
- movdqa %xmm9,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm9,%xmm0
- movdqa %xmm11,%xmm2
-.byte 102,15,56,0,208
- pxor %xmm1,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
- movdqa %xmm10,%xmm4
-.byte 102,15,56,0,224
- pxor %xmm2,%xmm4
- movdqa %xmm10,%xmm2
-.byte 102,15,56,0,211
- pxor %xmm0,%xmm2
- movdqa %xmm10,%xmm3
-.byte 102,15,56,0,220
- pxor %xmm1,%xmm3
- movdqa %xmm13,%xmm4
-.byte 102,15,56,0,226
- movdqa %xmm12,%xmm0
-.byte 102,15,56,0,195
- pxor %xmm4,%xmm0
-
-
- pxor %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_schedule_transform:
-
- movdqa %xmm9,%xmm1
- pandn %xmm0,%xmm1
- psrld $4,%xmm1
- pand %xmm9,%xmm0
- movdqa (%r11),%xmm2
-.byte 102,15,56,0,208
- movdqa 16(%r11),%xmm0
-.byte 102,15,56,0,193
- pxor %xmm2,%xmm0
- ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_schedule_mangle:
-
- movdqa %xmm0,%xmm4
- movdqa L$k_mc_forward(%rip),%xmm5
- testq %rcx,%rcx
- jnz L$schedule_mangle_dec
-
-
- addq $16,%rdx
- pxor L$k_s63(%rip),%xmm4
-.byte 102,15,56,0,229
- movdqa %xmm4,%xmm3
-.byte 102,15,56,0,229
- pxor %xmm4,%xmm3
-.byte 102,15,56,0,229
- pxor %xmm4,%xmm3
-
- jmp L$schedule_mangle_both
-.p2align 4
-L$schedule_mangle_dec:
-
- leaq L$k_dksd(%rip),%r11
- movdqa %xmm9,%xmm1
- pandn %xmm4,%xmm1
- psrld $4,%xmm1
- pand %xmm9,%xmm4
-
- movdqa 0(%r11),%xmm2
-.byte 102,15,56,0,212
- movdqa 16(%r11),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
-
- movdqa 32(%r11),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 48(%r11),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
-
- movdqa 64(%r11),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 80(%r11),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-.byte 102,15,56,0,221
-
- movdqa 96(%r11),%xmm2
-.byte 102,15,56,0,212
- pxor %xmm3,%xmm2
- movdqa 112(%r11),%xmm3
-.byte 102,15,56,0,217
- pxor %xmm2,%xmm3
-
- addq $-16,%rdx
-
-L$schedule_mangle_both:
- movdqa (%r8,%r10,1),%xmm1
-.byte 102,15,56,0,217
- addq $-16,%r8
- andq $0x30,%r8
- movdqu %xmm3,(%rdx)
- ret
-
-
-
-
-
-
-.globl _vpaes_set_encrypt_key
-.private_extern _vpaes_set_encrypt_key
-
-.p2align 4
-_vpaes_set_encrypt_key:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
- movb $1,_BORINGSSL_function_hit+5(%rip)
-#endif
-
- movl %esi,%eax
- shrl $5,%eax
- addl $5,%eax
- movl %eax,240(%rdx)
-
- movl $0,%ecx
- movl $0x30,%r8d
- call _vpaes_schedule_core
- xorl %eax,%eax
- ret
-
-
-
-.globl _vpaes_set_decrypt_key
-.private_extern _vpaes_set_decrypt_key
-
-.p2align 4
-_vpaes_set_decrypt_key:
-
-_CET_ENDBR
- movl %esi,%eax
- shrl $5,%eax
- addl $5,%eax
- movl %eax,240(%rdx)
- shll $4,%eax
- leaq 16(%rdx,%rax,1),%rdx
-
- movl $1,%ecx
- movl %esi,%r8d
- shrl $1,%r8d
- andl $32,%r8d
- xorl $32,%r8d
- call _vpaes_schedule_core
- xorl %eax,%eax
- ret
-
-
-
-.globl _vpaes_encrypt
-.private_extern _vpaes_encrypt
-
-.p2align 4
-_vpaes_encrypt:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
- movb $1,_BORINGSSL_function_hit+4(%rip)
-#endif
- movdqu (%rdi),%xmm0
- call _vpaes_preheat
- call _vpaes_encrypt_core
- movdqu %xmm0,(%rsi)
- ret
-
-
-
-.globl _vpaes_decrypt
-.private_extern _vpaes_decrypt
-
-.p2align 4
-_vpaes_decrypt:
-
-_CET_ENDBR
- movdqu (%rdi),%xmm0
- call _vpaes_preheat
- call _vpaes_decrypt_core
- movdqu %xmm0,(%rsi)
- ret
-
-
-.globl _vpaes_cbc_encrypt
-.private_extern _vpaes_cbc_encrypt
-
-.p2align 4
-_vpaes_cbc_encrypt:
-
-_CET_ENDBR
- xchgq %rcx,%rdx
- subq $16,%rcx
- jc L$cbc_abort
- movdqu (%r8),%xmm6
- subq %rdi,%rsi
- call _vpaes_preheat
- cmpl $0,%r9d
- je L$cbc_dec_loop
- jmp L$cbc_enc_loop
-.p2align 4
-L$cbc_enc_loop:
- movdqu (%rdi),%xmm0
- pxor %xmm6,%xmm0
- call _vpaes_encrypt_core
- movdqa %xmm0,%xmm6
- movdqu %xmm0,(%rsi,%rdi,1)
- leaq 16(%rdi),%rdi
- subq $16,%rcx
- jnc L$cbc_enc_loop
- jmp L$cbc_done
-.p2align 4
-L$cbc_dec_loop:
- movdqu (%rdi),%xmm0
- movdqa %xmm0,%xmm7
- call _vpaes_decrypt_core
- pxor %xmm6,%xmm0
- movdqa %xmm7,%xmm6
- movdqu %xmm0,(%rsi,%rdi,1)
- leaq 16(%rdi),%rdi
- subq $16,%rcx
- jnc L$cbc_dec_loop
-L$cbc_done:
- movdqu %xmm6,(%r8)
-L$cbc_abort:
- ret
-
-
-.globl _vpaes_ctr32_encrypt_blocks
-.private_extern _vpaes_ctr32_encrypt_blocks
-
-.p2align 4
-_vpaes_ctr32_encrypt_blocks:
-
-_CET_ENDBR
-
- xchgq %rcx,%rdx
- testq %rcx,%rcx
- jz L$ctr32_abort
- movdqu (%r8),%xmm0
- movdqa L$ctr_add_one(%rip),%xmm8
- subq %rdi,%rsi
- call _vpaes_preheat
- movdqa %xmm0,%xmm6
- pshufb L$rev_ctr(%rip),%xmm6
-
- testq $1,%rcx
- jz L$ctr32_prep_loop
-
-
-
- movdqu (%rdi),%xmm7
- call _vpaes_encrypt_core
- pxor %xmm7,%xmm0
- paddd %xmm8,%xmm6
- movdqu %xmm0,(%rsi,%rdi,1)
- subq $1,%rcx
- leaq 16(%rdi),%rdi
- jz L$ctr32_done
-
-L$ctr32_prep_loop:
-
-
- movdqa %xmm6,%xmm14
- movdqa %xmm6,%xmm15
- paddd %xmm8,%xmm15
-
-L$ctr32_loop:
- movdqa L$rev_ctr(%rip),%xmm1
- movdqa %xmm14,%xmm0
- movdqa %xmm15,%xmm6
-.byte 102,15,56,0,193
-.byte 102,15,56,0,241
- call _vpaes_encrypt_core_2x
- movdqu (%rdi),%xmm1
- movdqu 16(%rdi),%xmm2
- movdqa L$ctr_add_two(%rip),%xmm3
- pxor %xmm1,%xmm0
- pxor %xmm2,%xmm6
- paddd %xmm3,%xmm14
- paddd %xmm3,%xmm15
- movdqu %xmm0,(%rsi,%rdi,1)
- movdqu %xmm6,16(%rsi,%rdi,1)
- subq $2,%rcx
- leaq 32(%rdi),%rdi
- jnz L$ctr32_loop
-
-L$ctr32_done:
-L$ctr32_abort:
- ret
-
-
-
-
-
-
-
-
-
-.p2align 4
-_vpaes_preheat:
-
- leaq L$k_s0F(%rip),%r10
- movdqa -32(%r10),%xmm10
- movdqa -16(%r10),%xmm11
- movdqa 0(%r10),%xmm9
- movdqa 48(%r10),%xmm13
- movdqa 64(%r10),%xmm12
- movdqa 80(%r10),%xmm15
- movdqa 96(%r10),%xmm14
- ret
-
-
-
-
-
-
-
-
-.section __DATA,__const
-.p2align 6
-_vpaes_consts:
-L$k_inv:
-.quad 0x0E05060F0D080180, 0x040703090A0B0C02
-.quad 0x01040A060F0B0780, 0x030D0E0C02050809
-
-L$k_s0F:
-.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
-
-L$k_ipt:
-.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-
-L$k_sb1:
-.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-L$k_sb2:
-.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
-.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
-L$k_sbo:
-.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-
-L$k_mc_forward:
-.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad 0x080B0A0904070605, 0x000302010C0F0E0D
-.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad 0x000302010C0F0E0D, 0x080B0A0904070605
-
-L$k_mc_backward:
-.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad 0x020100030E0D0C0F, 0x0A09080B06050407
-.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad 0x0A09080B06050407, 0x020100030E0D0C0F
-
-L$k_sr:
-.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad 0x030E09040F0A0500, 0x0B06010C07020D08
-.quad 0x0F060D040B020900, 0x070E050C030A0108
-.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
-
-L$k_rcon:
-.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-L$k_s63:
-.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
-
-L$k_opt:
-.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-
-L$k_deskew:
-.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-
-
-
-
-L$k_dksd:
-.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-L$k_dksb:
-.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-L$k_dkse:
-.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-L$k_dks9:
-.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-
-
-
-
-L$k_dipt:
-.quad 0x0F505B040B545F00, 0x154A411E114E451A
-.quad 0x86E383E660056500, 0x12771772F491F194
-
-L$k_dsb9:
-.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-L$k_dsbd:
-.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-L$k_dsbb:
-.quad 0xD022649296B44200, 0x602646F6B0F2D404
-.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-L$k_dsbe:
-.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-L$k_dsbo:
-.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-
-
-L$rev_ctr:
-.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
-
-
-L$ctr_add_one:
-.quad 0x0000000000000000, 0x0000000100000000
-L$ctr_add_two:
-.quad 0x0000000000000000, 0x0000000200000000
-
-.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.p2align 6
-
-.text
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S
deleted file mode 100644
index a4c719c..0000000
--- a/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S
+++ /dev/null
@@ -1,1250 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text
-
-
-
-.globl _bn_mul_mont
-.private_extern _bn_mul_mont
-
-.p2align 4
-_bn_mul_mont:
-
-_CET_ENDBR
- movl %r9d,%r9d
- movq %rsp,%rax
-
- testl $3,%r9d
- jnz L$mul_enter
- cmpl $8,%r9d
- jb L$mul_enter
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 8(%r11),%r11d
- cmpq %rsi,%rdx
- jne L$mul4x_enter
- testl $7,%r9d
- jz L$sqr8x_enter
- jmp L$mul4x_enter
-
-.p2align 4
-L$mul_enter:
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-
- negq %r9
- movq %rsp,%r11
- leaq -16(%rsp,%r9,8),%r10
- negq %r9
- andq $-1024,%r10
-
-
-
-
-
-
-
-
-
- subq %r10,%r11
- andq $-4096,%r11
- leaq (%r10,%r11,1),%rsp
- movq (%rsp),%r11
- cmpq %r10,%rsp
- ja L$mul_page_walk
- jmp L$mul_page_walk_done
-
-.p2align 4
-L$mul_page_walk:
- leaq -4096(%rsp),%rsp
- movq (%rsp),%r11
- cmpq %r10,%rsp
- ja L$mul_page_walk
-L$mul_page_walk_done:
-
- movq %rax,8(%rsp,%r9,8)
-
-L$mul_body:
- movq %rdx,%r12
- movq (%r8),%r8
- movq (%r12),%rbx
- movq (%rsi),%rax
-
- xorq %r14,%r14
- xorq %r15,%r15
-
- movq %r8,%rbp
- mulq %rbx
- movq %rax,%r10
- movq (%rcx),%rax
-
- imulq %r10,%rbp
- movq %rdx,%r11
-
- mulq %rbp
- addq %rax,%r10
- movq 8(%rsi),%rax
- adcq $0,%rdx
- movq %rdx,%r13
-
- leaq 1(%r15),%r15
- jmp L$1st_enter
-
-.p2align 4
-L$1st:
- addq %rax,%r13
- movq (%rsi,%r15,8),%rax
- adcq $0,%rdx
- addq %r11,%r13
- movq %r10,%r11
- adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
- movq %rdx,%r13
-
-L$1st_enter:
- mulq %rbx
- addq %rax,%r11
- movq (%rcx,%r15,8),%rax
- adcq $0,%rdx
- leaq 1(%r15),%r15
- movq %rdx,%r10
-
- mulq %rbp
- cmpq %r9,%r15
- jne L$1st
-
- addq %rax,%r13
- movq (%rsi),%rax
- adcq $0,%rdx
- addq %r11,%r13
- adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
- movq %rdx,%r13
- movq %r10,%r11
-
- xorq %rdx,%rdx
- addq %r11,%r13
- adcq $0,%rdx
- movq %r13,-8(%rsp,%r9,8)
- movq %rdx,(%rsp,%r9,8)
-
- leaq 1(%r14),%r14
- jmp L$outer
-.p2align 4
-L$outer:
- movq (%r12,%r14,8),%rbx
- xorq %r15,%r15
- movq %r8,%rbp
- movq (%rsp),%r10
- mulq %rbx
- addq %rax,%r10
- movq (%rcx),%rax
- adcq $0,%rdx
-
- imulq %r10,%rbp
- movq %rdx,%r11
-
- mulq %rbp
- addq %rax,%r10
- movq 8(%rsi),%rax
- adcq $0,%rdx
- movq 8(%rsp),%r10
- movq %rdx,%r13
-
- leaq 1(%r15),%r15
- jmp L$inner_enter
-
-.p2align 4
-L$inner:
- addq %rax,%r13
- movq (%rsi,%r15,8),%rax
- adcq $0,%rdx
- addq %r10,%r13
- movq (%rsp,%r15,8),%r10
- adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
- movq %rdx,%r13
-
-L$inner_enter:
- mulq %rbx
- addq %rax,%r11
- movq (%rcx,%r15,8),%rax
- adcq $0,%rdx
- addq %r11,%r10
- movq %rdx,%r11
- adcq $0,%r11
- leaq 1(%r15),%r15
-
- mulq %rbp
- cmpq %r9,%r15
- jne L$inner
-
- addq %rax,%r13
- movq (%rsi),%rax
- adcq $0,%rdx
- addq %r10,%r13
- movq (%rsp,%r15,8),%r10
- adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
- movq %rdx,%r13
-
- xorq %rdx,%rdx
- addq %r11,%r13
- adcq $0,%rdx
- addq %r10,%r13
- adcq $0,%rdx
- movq %r13,-8(%rsp,%r9,8)
- movq %rdx,(%rsp,%r9,8)
-
- leaq 1(%r14),%r14
- cmpq %r9,%r14
- jb L$outer
-
- xorq %r14,%r14
- movq (%rsp),%rax
- movq %r9,%r15
-
-.p2align 4
-L$sub: sbbq (%rcx,%r14,8),%rax
- movq %rax,(%rdi,%r14,8)
- movq 8(%rsp,%r14,8),%rax
- leaq 1(%r14),%r14
- decq %r15
- jnz L$sub
-
- sbbq $0,%rax
- movq $-1,%rbx
- xorq %rax,%rbx
- xorq %r14,%r14
- movq %r9,%r15
-
-L$copy:
- movq (%rdi,%r14,8),%rcx
- movq (%rsp,%r14,8),%rdx
- andq %rbx,%rcx
- andq %rax,%rdx
- movq %r9,(%rsp,%r14,8)
- orq %rcx,%rdx
- movq %rdx,(%rdi,%r14,8)
- leaq 1(%r14),%r14
- subq $1,%r15
- jnz L$copy
-
- movq 8(%rsp,%r9,8),%rsi
-
- movq $1,%rax
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- mov