Roll generated boringssl files to 2db0eb3f96a5756298dcd7f9319e56a98585bd10.

Change-Id: I7d0428b1a4aee7c7faf5bcbe928c0ee45a957ea6
Reviewed-on: https://dart-review.googlesource.com/c/boringssl_gen/+/375140
Reviewed-by: Brian Quinlan <bquinlan@google.com>
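
Note (not part of the generated diff): the variable lists below are meant to be
imported by an embedder's BUILD.gn that sits next to BUILD.generated.gni, so the
relative "src/..." and "src/gen/..." paths resolve. A minimal sketch of such a
consumer is shown here; the import path, target names, and the is_win flag are
illustrative assumptions, not part of this roll.

  # Sketch of a consumer BUILD.gn placed alongside BUILD.generated.gni.
  # Path and condition names are assumptions borrowed from typical embedder builds.
  import("//third_party/boringssl/BUILD.generated.gni")

  config("boringssl_config") {
    # Headers listed in crypto_headers live under src/include.
    include_dirs = [ "src/include" ]
  }

  source_set("boringssl_crypto") {
    sources = crypto_sources
    # Real builds gate the assembly lists more finely on OS and CPU; this only
    # illustrates the asm-vs-nasm split used by the generated lists.
    if (is_win) {
      sources += crypto_sources_nasm
    } else {
      sources += crypto_sources_asm
    }
    public_configs = [ ":boringssl_config" ]
  }

  source_set("boringssl_ssl") {
    sources = ssl_sources
    deps = [ ":boringssl_crypto" ]
    public_configs = [ ":boringssl_config" ]
  }
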
diff --git a/BUILD.generated.gni b/BUILD.generated.gni
index 910ebff..599b590 100644
--- a/BUILD.generated.gni
+++ b/BUILD.generated.gni
@@ -15,7 +15,6 @@
 # This file is created by generate_build_files.py. Do not edit manually.
 
 crypto_sources = [
-  "err_data.c",
   "src/crypto/asn1/a_bitstr.c",
   "src/crypto/asn1/a_bool.c",
   "src/crypto/asn1/a_d2i_fp.c",
@@ -121,6 +120,8 @@
   "src/crypto/evp/evp_asn1.c",
   "src/crypto/evp/evp_ctx.c",
   "src/crypto/evp/internal.h",
+  "src/crypto/evp/p_dh.c",
+  "src/crypto/evp/p_dh_asn1.c",
   "src/crypto/evp/p_dsa_asn1.c",
   "src/crypto/evp/p_ec.c",
   "src/crypto/evp/p_ec_asn1.c",
@@ -165,8 +166,9 @@
   "src/crypto/hrss/hrss.c",
   "src/crypto/hrss/internal.h",
   "src/crypto/internal.h",
+  "src/crypto/keccak/internal.h",
+  "src/crypto/keccak/keccak.c",
   "src/crypto/kyber/internal.h",
-  "src/crypto/kyber/keccak.c",
   "src/crypto/kyber/kyber.c",
   "src/crypto/lhash/internal.h",
   "src/crypto/lhash/lhash.c",
@@ -210,6 +212,20 @@
   "src/crypto/rsa_extra/rsa_crypt.c",
   "src/crypto/rsa_extra/rsa_print.c",
   "src/crypto/siphash/siphash.c",
+  "src/crypto/spx/address.c",
+  "src/crypto/spx/address.h",
+  "src/crypto/spx/fors.c",
+  "src/crypto/spx/fors.h",
+  "src/crypto/spx/merkle.c",
+  "src/crypto/spx/merkle.h",
+  "src/crypto/spx/params.h",
+  "src/crypto/spx/spx.c",
+  "src/crypto/spx/spx_util.c",
+  "src/crypto/spx/spx_util.h",
+  "src/crypto/spx/thash.c",
+  "src/crypto/spx/thash.h",
+  "src/crypto/spx/wots.c",
+  "src/crypto/spx/wots.h",
   "src/crypto/stack/stack.c",
   "src/crypto/thread.c",
   "src/crypto/thread_none.c",
@@ -226,6 +242,7 @@
   "src/crypto/x509/asn1_gen.c",
   "src/crypto/x509/by_dir.c",
   "src/crypto/x509/by_file.c",
+  "src/crypto/x509/ext_dat.h",
   "src/crypto/x509/i2d_pr.c",
   "src/crypto/x509/internal.h",
   "src/crypto/x509/name_print.c",
@@ -235,6 +252,29 @@
   "src/crypto/x509/t_req.c",
   "src/crypto/x509/t_x509.c",
   "src/crypto/x509/t_x509a.c",
+  "src/crypto/x509/v3_akey.c",
+  "src/crypto/x509/v3_akeya.c",
+  "src/crypto/x509/v3_alt.c",
+  "src/crypto/x509/v3_bcons.c",
+  "src/crypto/x509/v3_bitst.c",
+  "src/crypto/x509/v3_conf.c",
+  "src/crypto/x509/v3_cpols.c",
+  "src/crypto/x509/v3_crld.c",
+  "src/crypto/x509/v3_enum.c",
+  "src/crypto/x509/v3_extku.c",
+  "src/crypto/x509/v3_genn.c",
+  "src/crypto/x509/v3_ia5.c",
+  "src/crypto/x509/v3_info.c",
+  "src/crypto/x509/v3_int.c",
+  "src/crypto/x509/v3_lib.c",
+  "src/crypto/x509/v3_ncons.c",
+  "src/crypto/x509/v3_ocsp.c",
+  "src/crypto/x509/v3_pcons.c",
+  "src/crypto/x509/v3_pmaps.c",
+  "src/crypto/x509/v3_prn.c",
+  "src/crypto/x509/v3_purp.c",
+  "src/crypto/x509/v3_skey.c",
+  "src/crypto/x509/v3_utl.c",
   "src/crypto/x509/x509.c",
   "src/crypto/x509/x509_att.c",
   "src/crypto/x509/x509_cmp.c",
@@ -259,9 +299,7 @@
   "src/crypto/x509/x_attrib.c",
   "src/crypto/x509/x_crl.c",
   "src/crypto/x509/x_exten.c",
-  "src/crypto/x509/x_info.c",
   "src/crypto/x509/x_name.c",
-  "src/crypto/x509/x_pkey.c",
   "src/crypto/x509/x_pubkey.c",
   "src/crypto/x509/x_req.c",
   "src/crypto/x509/x_sig.c",
@@ -269,31 +307,7 @@
   "src/crypto/x509/x_val.c",
   "src/crypto/x509/x_x509.c",
   "src/crypto/x509/x_x509a.c",
-  "src/crypto/x509v3/ext_dat.h",
-  "src/crypto/x509v3/internal.h",
-  "src/crypto/x509v3/v3_akey.c",
-  "src/crypto/x509v3/v3_akeya.c",
-  "src/crypto/x509v3/v3_alt.c",
-  "src/crypto/x509v3/v3_bcons.c",
-  "src/crypto/x509v3/v3_bitst.c",
-  "src/crypto/x509v3/v3_conf.c",
-  "src/crypto/x509v3/v3_cpols.c",
-  "src/crypto/x509v3/v3_crld.c",
-  "src/crypto/x509v3/v3_enum.c",
-  "src/crypto/x509v3/v3_extku.c",
-  "src/crypto/x509v3/v3_genn.c",
-  "src/crypto/x509v3/v3_ia5.c",
-  "src/crypto/x509v3/v3_info.c",
-  "src/crypto/x509v3/v3_int.c",
-  "src/crypto/x509v3/v3_lib.c",
-  "src/crypto/x509v3/v3_ncons.c",
-  "src/crypto/x509v3/v3_ocsp.c",
-  "src/crypto/x509v3/v3_pcons.c",
-  "src/crypto/x509v3/v3_pmaps.c",
-  "src/crypto/x509v3/v3_prn.c",
-  "src/crypto/x509v3/v3_purp.c",
-  "src/crypto/x509v3/v3_skey.c",
-  "src/crypto/x509v3/v3_utl.c",
+  "src/gen/crypto/err_data.c",
   "src/third_party/fiat/curve25519_32.h",
   "src/third_party/fiat/curve25519_64.h",
   "src/third_party/fiat/curve25519_64_adx.h",
@@ -304,177 +318,168 @@
 ]
 
 crypto_sources_asm = [
-  "apple-aarch64/crypto/chacha/chacha-armv8-apple.S",
-  "apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S",
-  "apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S",
-  "apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S",
-  "apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S",
-  "apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S",
-  "apple-aarch64/crypto/test/trampoline-armv8-apple.S",
-  "apple-arm/crypto/chacha/chacha-armv4-apple.S",
-  "apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S",
-  "apple-arm/crypto/fipsmodule/armv4-mont-apple.S",
-  "apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S",
-  "apple-arm/crypto/fipsmodule/ghash-armv4-apple.S",
-  "apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S",
-  "apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S",
-  "apple-arm/crypto/fipsmodule/sha256-armv4-apple.S",
-  "apple-arm/crypto/fipsmodule/sha512-armv4-apple.S",
-  "apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S",
-  "apple-arm/crypto/test/trampoline-armv4-apple.S",
-  "apple-x86/crypto/chacha/chacha-x86-apple.S",
-  "apple-x86/crypto/fipsmodule/aesni-x86-apple.S",
-  "apple-x86/crypto/fipsmodule/bn-586-apple.S",
-  "apple-x86/crypto/fipsmodule/co-586-apple.S",
-  "apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S",
-  "apple-x86/crypto/fipsmodule/ghash-x86-apple.S",
-  "apple-x86/crypto/fipsmodule/md5-586-apple.S",
-  "apple-x86/crypto/fipsmodule/sha1-586-apple.S",
-  "apple-x86/crypto/fipsmodule/sha256-586-apple.S",
-  "apple-x86/crypto/fipsmodule/sha512-586-apple.S",
-  "apple-x86/crypto/fipsmodule/vpaes-x86-apple.S",
-  "apple-x86/crypto/fipsmodule/x86-mont-apple.S",
-  "apple-x86/crypto/test/trampoline-x86-apple.S",
-  "apple-x86_64/crypto/chacha/chacha-x86_64-apple.S",
-  "apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S",
-  "apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S",
-  "apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S",
-  "apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S",
-  "apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S",
-  "apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S",
-  "apple-x86_64/crypto/fipsmodule/x86_64-mont5-apple.S",
-  "apple-x86_64/crypto/test/trampoline-x86_64-apple.S",
-  "linux-aarch64/crypto/chacha/chacha-armv8-linux.S",
-  "linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/aesv8-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/armv8-mont-linux.S",
-  "linux-aarch64/crypto/fipsmodule/bn-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/ghashv8-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/p256-armv8-asm-linux.S",
-  "linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-linux.S",
-  "linux-aarch64/crypto/fipsmodule/sha1-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/sha256-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/sha512-armv8-linux.S",
-  "linux-aarch64/crypto/fipsmodule/vpaes-armv8-linux.S",
-  "linux-aarch64/crypto/test/trampoline-armv8-linux.S",
-  "linux-arm/crypto/chacha/chacha-armv4-linux.S",
-  "linux-arm/crypto/fipsmodule/aesv8-armv7-linux.S",
-  "linux-arm/crypto/fipsmodule/armv4-mont-linux.S",
-  "linux-arm/crypto/fipsmodule/bsaes-armv7-linux.S",
-  "linux-arm/crypto/fipsmodule/ghash-armv4-linux.S",
-  "linux-arm/crypto/fipsmodule/ghashv8-armv7-linux.S",
-  "linux-arm/crypto/fipsmodule/sha1-armv4-large-linux.S",
-  "linux-arm/crypto/fipsmodule/sha256-armv4-linux.S",
-  "linux-arm/crypto/fipsmodule/sha512-armv4-linux.S",
-  "linux-arm/crypto/fipsmodule/vpaes-armv7-linux.S",
-  "linux-arm/crypto/test/trampoline-armv4-linux.S",
-  "linux-x86/crypto/chacha/chacha-x86-linux.S",
-  "linux-x86/crypto/fipsmodule/aesni-x86-linux.S",
-  "linux-x86/crypto/fipsmodule/bn-586-linux.S",
-  "linux-x86/crypto/fipsmodule/co-586-linux.S",
-  "linux-x86/crypto/fipsmodule/ghash-ssse3-x86-linux.S",
-  "linux-x86/crypto/fipsmodule/ghash-x86-linux.S",
-  "linux-x86/crypto/fipsmodule/md5-586-linux.S",
-  "linux-x86/crypto/fipsmodule/sha1-586-linux.S",
-  "linux-x86/crypto/fipsmodule/sha256-586-linux.S",
-  "linux-x86/crypto/fipsmodule/sha512-586-linux.S",
-  "linux-x86/crypto/fipsmodule/vpaes-x86-linux.S",
-  "linux-x86/crypto/fipsmodule/x86-mont-linux.S",
-  "linux-x86/crypto/test/trampoline-x86-linux.S",
-  "linux-x86_64/crypto/chacha/chacha-x86_64-linux.S",
-  "linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-linux.S",
-  "linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/aesni-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/ghash-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/md5-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm-linux.S",
-  "linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-linux.S",
-  "linux-x86_64/crypto/fipsmodule/rdrand-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/rsaz-avx2-linux.S",
-  "linux-x86_64/crypto/fipsmodule/sha1-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/sha256-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/sha512-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/vpaes-x86_64-linux.S",
-  "linux-x86_64/crypto/fipsmodule/x86_64-mont-linux.S",
-  "linux-x86_64/crypto/fipsmodule/x86_64-mont5-linux.S",
-  "linux-x86_64/crypto/test/trampoline-x86_64-linux.S",
   "src/crypto/curve25519/asm/x25519-asm-arm.S",
   "src/crypto/hrss/asm/poly_rq_mul.S",
   "src/crypto/poly1305/poly1305_arm_asm.S",
+  "src/gen/bcm/aesni-gcm-x86_64-apple.S",
+  "src/gen/bcm/aesni-gcm-x86_64-linux.S",
+  "src/gen/bcm/aesni-x86-apple.S",
+  "src/gen/bcm/aesni-x86-linux.S",
+  "src/gen/bcm/aesni-x86_64-apple.S",
+  "src/gen/bcm/aesni-x86_64-linux.S",
+  "src/gen/bcm/aesv8-armv7-linux.S",
+  "src/gen/bcm/aesv8-armv8-apple.S",
+  "src/gen/bcm/aesv8-armv8-linux.S",
+  "src/gen/bcm/aesv8-armv8-win.S",
+  "src/gen/bcm/aesv8-gcm-armv8-apple.S",
+  "src/gen/bcm/aesv8-gcm-armv8-linux.S",
+  "src/gen/bcm/aesv8-gcm-armv8-win.S",
+  "src/gen/bcm/armv4-mont-linux.S",
+  "src/gen/bcm/armv8-mont-apple.S",
+  "src/gen/bcm/armv8-mont-linux.S",
+  "src/gen/bcm/armv8-mont-win.S",
+  "src/gen/bcm/bn-586-apple.S",
+  "src/gen/bcm/bn-586-linux.S",
+  "src/gen/bcm/bn-armv8-apple.S",
+  "src/gen/bcm/bn-armv8-linux.S",
+  "src/gen/bcm/bn-armv8-win.S",
+  "src/gen/bcm/bsaes-armv7-linux.S",
+  "src/gen/bcm/co-586-apple.S",
+  "src/gen/bcm/co-586-linux.S",
+  "src/gen/bcm/ghash-armv4-linux.S",
+  "src/gen/bcm/ghash-neon-armv8-apple.S",
+  "src/gen/bcm/ghash-neon-armv8-linux.S",
+  "src/gen/bcm/ghash-neon-armv8-win.S",
+  "src/gen/bcm/ghash-ssse3-x86-apple.S",
+  "src/gen/bcm/ghash-ssse3-x86-linux.S",
+  "src/gen/bcm/ghash-ssse3-x86_64-apple.S",
+  "src/gen/bcm/ghash-ssse3-x86_64-linux.S",
+  "src/gen/bcm/ghash-x86-apple.S",
+  "src/gen/bcm/ghash-x86-linux.S",
+  "src/gen/bcm/ghash-x86_64-apple.S",
+  "src/gen/bcm/ghash-x86_64-linux.S",
+  "src/gen/bcm/ghashv8-armv7-linux.S",
+  "src/gen/bcm/ghashv8-armv8-apple.S",
+  "src/gen/bcm/ghashv8-armv8-linux.S",
+  "src/gen/bcm/ghashv8-armv8-win.S",
+  "src/gen/bcm/md5-586-apple.S",
+  "src/gen/bcm/md5-586-linux.S",
+  "src/gen/bcm/md5-x86_64-apple.S",
+  "src/gen/bcm/md5-x86_64-linux.S",
+  "src/gen/bcm/p256-armv8-asm-apple.S",
+  "src/gen/bcm/p256-armv8-asm-linux.S",
+  "src/gen/bcm/p256-armv8-asm-win.S",
+  "src/gen/bcm/p256-x86_64-asm-apple.S",
+  "src/gen/bcm/p256-x86_64-asm-linux.S",
+  "src/gen/bcm/p256_beeu-armv8-asm-apple.S",
+  "src/gen/bcm/p256_beeu-armv8-asm-linux.S",
+  "src/gen/bcm/p256_beeu-armv8-asm-win.S",
+  "src/gen/bcm/p256_beeu-x86_64-asm-apple.S",
+  "src/gen/bcm/p256_beeu-x86_64-asm-linux.S",
+  "src/gen/bcm/rdrand-x86_64-apple.S",
+  "src/gen/bcm/rdrand-x86_64-linux.S",
+  "src/gen/bcm/rsaz-avx2-apple.S",
+  "src/gen/bcm/rsaz-avx2-linux.S",
+  "src/gen/bcm/sha1-586-apple.S",
+  "src/gen/bcm/sha1-586-linux.S",
+  "src/gen/bcm/sha1-armv4-large-linux.S",
+  "src/gen/bcm/sha1-armv8-apple.S",
+  "src/gen/bcm/sha1-armv8-linux.S",
+  "src/gen/bcm/sha1-armv8-win.S",
+  "src/gen/bcm/sha1-x86_64-apple.S",
+  "src/gen/bcm/sha1-x86_64-linux.S",
+  "src/gen/bcm/sha256-586-apple.S",
+  "src/gen/bcm/sha256-586-linux.S",
+  "src/gen/bcm/sha256-armv4-linux.S",
+  "src/gen/bcm/sha256-armv8-apple.S",
+  "src/gen/bcm/sha256-armv8-linux.S",
+  "src/gen/bcm/sha256-armv8-win.S",
+  "src/gen/bcm/sha256-x86_64-apple.S",
+  "src/gen/bcm/sha256-x86_64-linux.S",
+  "src/gen/bcm/sha512-586-apple.S",
+  "src/gen/bcm/sha512-586-linux.S",
+  "src/gen/bcm/sha512-armv4-linux.S",
+  "src/gen/bcm/sha512-armv8-apple.S",
+  "src/gen/bcm/sha512-armv8-linux.S",
+  "src/gen/bcm/sha512-armv8-win.S",
+  "src/gen/bcm/sha512-x86_64-apple.S",
+  "src/gen/bcm/sha512-x86_64-linux.S",
+  "src/gen/bcm/vpaes-armv7-linux.S",
+  "src/gen/bcm/vpaes-armv8-apple.S",
+  "src/gen/bcm/vpaes-armv8-linux.S",
+  "src/gen/bcm/vpaes-armv8-win.S",
+  "src/gen/bcm/vpaes-x86-apple.S",
+  "src/gen/bcm/vpaes-x86-linux.S",
+  "src/gen/bcm/vpaes-x86_64-apple.S",
+  "src/gen/bcm/vpaes-x86_64-linux.S",
+  "src/gen/bcm/x86-mont-apple.S",
+  "src/gen/bcm/x86-mont-linux.S",
+  "src/gen/bcm/x86_64-mont-apple.S",
+  "src/gen/bcm/x86_64-mont-linux.S",
+  "src/gen/bcm/x86_64-mont5-apple.S",
+  "src/gen/bcm/x86_64-mont5-linux.S",
+  "src/gen/crypto/aes128gcmsiv-x86_64-apple.S",
+  "src/gen/crypto/aes128gcmsiv-x86_64-linux.S",
+  "src/gen/crypto/chacha-armv4-linux.S",
+  "src/gen/crypto/chacha-armv8-apple.S",
+  "src/gen/crypto/chacha-armv8-linux.S",
+  "src/gen/crypto/chacha-armv8-win.S",
+  "src/gen/crypto/chacha-x86-apple.S",
+  "src/gen/crypto/chacha-x86-linux.S",
+  "src/gen/crypto/chacha-x86_64-apple.S",
+  "src/gen/crypto/chacha-x86_64-linux.S",
+  "src/gen/crypto/chacha20_poly1305_armv8-apple.S",
+  "src/gen/crypto/chacha20_poly1305_armv8-linux.S",
+  "src/gen/crypto/chacha20_poly1305_armv8-win.S",
+  "src/gen/crypto/chacha20_poly1305_x86_64-apple.S",
+  "src/gen/crypto/chacha20_poly1305_x86_64-linux.S",
+  "src/gen/test_support/trampoline-armv4-linux.S",
+  "src/gen/test_support/trampoline-armv8-apple.S",
+  "src/gen/test_support/trampoline-armv8-linux.S",
+  "src/gen/test_support/trampoline-armv8-win.S",
+  "src/gen/test_support/trampoline-x86-apple.S",
+  "src/gen/test_support/trampoline-x86-linux.S",
+  "src/gen/test_support/trampoline-x86_64-apple.S",
+  "src/gen/test_support/trampoline-x86_64-linux.S",
   "src/third_party/fiat/asm/fiat_curve25519_adx_mul.S",
   "src/third_party/fiat/asm/fiat_curve25519_adx_square.S",
-  "win-aarch64/crypto/chacha/chacha-armv8-win.S",
-  "win-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/aesv8-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/armv8-mont-win.S",
-  "win-aarch64/crypto/fipsmodule/bn-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/ghash-neon-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/ghashv8-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/p256-armv8-asm-win.S",
-  "win-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-win.S",
-  "win-aarch64/crypto/fipsmodule/sha1-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/sha256-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/sha512-armv8-win.S",
-  "win-aarch64/crypto/fipsmodule/vpaes-armv8-win.S",
-  "win-aarch64/crypto/test/trampoline-armv8-win.S",
+  "src/third_party/fiat/asm/fiat_p256_adx_mul.S",
+  "src/third_party/fiat/asm/fiat_p256_adx_sqr.S",
 ]
 
 crypto_sources_nasm = [
-  "win-x86/crypto/chacha/chacha-x86-win.asm",
-  "win-x86/crypto/fipsmodule/aesni-x86-win.asm",
-  "win-x86/crypto/fipsmodule/bn-586-win.asm",
-  "win-x86/crypto/fipsmodule/co-586-win.asm",
-  "win-x86/crypto/fipsmodule/ghash-ssse3-x86-win.asm",
-  "win-x86/crypto/fipsmodule/ghash-x86-win.asm",
-  "win-x86/crypto/fipsmodule/md5-586-win.asm",
-  "win-x86/crypto/fipsmodule/sha1-586-win.asm",
-  "win-x86/crypto/fipsmodule/sha256-586-win.asm",
-  "win-x86/crypto/fipsmodule/sha512-586-win.asm",
-  "win-x86/crypto/fipsmodule/vpaes-x86-win.asm",
-  "win-x86/crypto/fipsmodule/x86-mont-win.asm",
-  "win-x86/crypto/test/trampoline-x86-win.asm",
-  "win-x86_64/crypto/chacha/chacha-x86_64-win.asm",
-  "win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-win.asm",
-  "win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/aesni-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/ghash-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/md5-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/p256-x86_64-asm-win.asm",
-  "win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-win.asm",
-  "win-x86_64/crypto/fipsmodule/rdrand-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/rsaz-avx2-win.asm",
-  "win-x86_64/crypto/fipsmodule/sha1-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/sha256-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/sha512-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/vpaes-x86_64-win.asm",
-  "win-x86_64/crypto/fipsmodule/x86_64-mont-win.asm",
-  "win-x86_64/crypto/fipsmodule/x86_64-mont5-win.asm",
-  "win-x86_64/crypto/test/trampoline-x86_64-win.asm",
+  "src/gen/bcm/aesni-gcm-x86_64-win.asm",
+  "src/gen/bcm/aesni-x86-win.asm",
+  "src/gen/bcm/aesni-x86_64-win.asm",
+  "src/gen/bcm/bn-586-win.asm",
+  "src/gen/bcm/co-586-win.asm",
+  "src/gen/bcm/ghash-ssse3-x86-win.asm",
+  "src/gen/bcm/ghash-ssse3-x86_64-win.asm",
+  "src/gen/bcm/ghash-x86-win.asm",
+  "src/gen/bcm/ghash-x86_64-win.asm",
+  "src/gen/bcm/md5-586-win.asm",
+  "src/gen/bcm/md5-x86_64-win.asm",
+  "src/gen/bcm/p256-x86_64-asm-win.asm",
+  "src/gen/bcm/p256_beeu-x86_64-asm-win.asm",
+  "src/gen/bcm/rdrand-x86_64-win.asm",
+  "src/gen/bcm/rsaz-avx2-win.asm",
+  "src/gen/bcm/sha1-586-win.asm",
+  "src/gen/bcm/sha1-x86_64-win.asm",
+  "src/gen/bcm/sha256-586-win.asm",
+  "src/gen/bcm/sha256-x86_64-win.asm",
+  "src/gen/bcm/sha512-586-win.asm",
+  "src/gen/bcm/sha512-x86_64-win.asm",
+  "src/gen/bcm/vpaes-x86-win.asm",
+  "src/gen/bcm/vpaes-x86_64-win.asm",
+  "src/gen/bcm/x86-mont-win.asm",
+  "src/gen/bcm/x86_64-mont-win.asm",
+  "src/gen/bcm/x86_64-mont5-win.asm",
+  "src/gen/crypto/aes128gcmsiv-x86_64-win.asm",
+  "src/gen/crypto/chacha-x86-win.asm",
+  "src/gen/crypto/chacha-x86_64-win.asm",
+  "src/gen/crypto/chacha20_poly1305_x86_64-win.asm",
+  "src/gen/test_support/trampoline-x86-win.asm",
+  "src/gen/test_support/trampoline-x86_64-win.asm",
 ]
 
 crypto_headers = [
@@ -517,13 +522,14 @@
   "src/include/openssl/evp.h",
   "src/include/openssl/evp_errors.h",
   "src/include/openssl/ex_data.h",
+  "src/include/openssl/experimental/kyber.h",
+  "src/include/openssl/experimental/spx.h",
   "src/include/openssl/hkdf.h",
   "src/include/openssl/hmac.h",
   "src/include/openssl/hpke.h",
   "src/include/openssl/hrss.h",
   "src/include/openssl/is_boringssl.h",
   "src/include/openssl/kdf.h",
-  "src/include/openssl/kyber.h",
   "src/include/openssl/lhash.h",
   "src/include/openssl/md4.h",
   "src/include/openssl/md5.h",
@@ -541,6 +547,7 @@
   "src/include/openssl/pkcs8.h",
   "src/include/openssl/poly1305.h",
   "src/include/openssl/pool.h",
+  "src/include/openssl/posix_time.h",
   "src/include/openssl/rand.h",
   "src/include/openssl/rc4.h",
   "src/include/openssl/ripemd.h",
@@ -559,6 +566,33 @@
   "src/include/openssl/x509.h",
   "src/include/openssl/x509_vfy.h",
   "src/include/openssl/x509v3.h",
+  "src/include/openssl/x509v3_errors.h",
+]
+
+rust_bssl_sys = [ "src/rust/bssl-sys/src/lib.rs" ]
+
+rust_bssl_crypto = [
+  "src/rust/bssl-crypto/src/aead.rs",
+  "src/rust/bssl-crypto/src/aes.rs",
+  "src/rust/bssl-crypto/src/cipher/aes_cbc.rs",
+  "src/rust/bssl-crypto/src/cipher/aes_ctr.rs",
+  "src/rust/bssl-crypto/src/cipher/mod.rs",
+  "src/rust/bssl-crypto/src/digest.rs",
+  "src/rust/bssl-crypto/src/ec.rs",
+  "src/rust/bssl-crypto/src/ecdh.rs",
+  "src/rust/bssl-crypto/src/ecdsa.rs",
+  "src/rust/bssl-crypto/src/ed25519.rs",
+  "src/rust/bssl-crypto/src/hkdf.rs",
+  "src/rust/bssl-crypto/src/hmac.rs",
+  "src/rust/bssl-crypto/src/hpke.rs",
+  "src/rust/bssl-crypto/src/lib.rs",
+  "src/rust/bssl-crypto/src/macros.rs",
+  "src/rust/bssl-crypto/src/mem.rs",
+  "src/rust/bssl-crypto/src/rand.rs",
+  "src/rust/bssl-crypto/src/rsa.rs",
+  "src/rust/bssl-crypto/src/scoped.rs",
+  "src/rust/bssl-crypto/src/test_helpers.rs",
+  "src/rust/bssl-crypto/src/x25519.rs",
 ]
 
 ssl_sources = [
@@ -584,6 +618,7 @@
   "src/ssl/ssl_buffer.cc",
   "src/ssl/ssl_cert.cc",
   "src/ssl/ssl_cipher.cc",
+  "src/ssl/ssl_credential.cc",
   "src/ssl/ssl_file.cc",
   "src/ssl/ssl_key_share.cc",
   "src/ssl/ssl_lib.cc",
@@ -611,97 +646,91 @@
 ]
 
 pki_sources = [
-  "src/pki/asn1_util.h",
   "src/pki/cert_error_id.cc",
-  "src/pki/cert_error_id.h",
   "src/pki/cert_error_params.cc",
-  "src/pki/cert_error_params.h",
   "src/pki/cert_errors.cc",
+  "src/pki/cert_issuer_source_static.cc",
+  "src/pki/certificate.cc",
+  "src/pki/certificate_policies.cc",
+  "src/pki/common_cert_errors.cc",
+  "src/pki/crl.cc",
+  "src/pki/encode_values.cc",
+  "src/pki/extended_key_usage.cc",
+  "src/pki/general_names.cc",
+  "src/pki/input.cc",
+  "src/pki/ip_util.cc",
+  "src/pki/name_constraints.cc",
+  "src/pki/ocsp.cc",
+  "src/pki/ocsp_verify_result.cc",
+  "src/pki/parse_certificate.cc",
+  "src/pki/parse_name.cc",
+  "src/pki/parse_values.cc",
+  "src/pki/parsed_certificate.cc",
+  "src/pki/parser.cc",
+  "src/pki/path_builder.cc",
+  "src/pki/pem.cc",
+  "src/pki/revocation_util.cc",
+  "src/pki/signature_algorithm.cc",
+  "src/pki/simple_path_builder_delegate.cc",
+  "src/pki/string_util.cc",
+  "src/pki/trust_store.cc",
+  "src/pki/trust_store_collection.cc",
+  "src/pki/trust_store_in_memory.cc",
+  "src/pki/verify_certificate_chain.cc",
+  "src/pki/verify_error.cc",
+  "src/pki/verify_name_match.cc",
+  "src/pki/verify_signed_data.cc",
+]
+
+pki_internal_headers = [
+  "src/pki/cert_error_id.h",
+  "src/pki/cert_error_params.h",
   "src/pki/cert_errors.h",
   "src/pki/cert_issuer_source.h",
-  "src/pki/cert_issuer_source_static.cc",
   "src/pki/cert_issuer_source_static.h",
   "src/pki/cert_issuer_source_sync_unittest.h",
-  "src/pki/cert_status_flags.h",
-  "src/pki/cert_status_flags_list.h",
-  "src/pki/certificate_policies.cc",
   "src/pki/certificate_policies.h",
-  "src/pki/common_cert_errors.cc",
   "src/pki/common_cert_errors.h",
-  "src/pki/crl.cc",
   "src/pki/crl.h",
-  "src/pki/encode_values.cc",
   "src/pki/encode_values.h",
-  "src/pki/extended_key_usage.cc",
   "src/pki/extended_key_usage.h",
-  "src/pki/fillins/file_util.h",
-  "src/pki/fillins/fillins_base64.cc",
-  "src/pki/fillins/fillins_base64.h",
-  "src/pki/fillins/fillins_string_util.cc",
-  "src/pki/fillins/fillins_string_util.h",
-  "src/pki/fillins/log.h",
-  "src/pki/fillins/net_errors.h",
-  "src/pki/fillins/openssl_util.cc",
-  "src/pki/fillins/openssl_util.h",
-  "src/pki/fillins/path_service.h",
-  "src/pki/general_names.cc",
   "src/pki/general_names.h",
-  "src/pki/input.cc",
   "src/pki/input.h",
-  "src/pki/ip_util.cc",
   "src/pki/ip_util.h",
   "src/pki/mock_signature_verify_cache.h",
-  "src/pki/name_constraints.cc",
   "src/pki/name_constraints.h",
   "src/pki/nist_pkits_unittest.h",
-  "src/pki/ocsp.cc",
   "src/pki/ocsp.h",
   "src/pki/ocsp_revocation_status.h",
-  "src/pki/ocsp_verify_result.cc",
   "src/pki/ocsp_verify_result.h",
-  "src/pki/parse_certificate.cc",
   "src/pki/parse_certificate.h",
-  "src/pki/parse_name.cc",
   "src/pki/parse_name.h",
-  "src/pki/parse_values.cc",
   "src/pki/parse_values.h",
-  "src/pki/parsed_certificate.cc",
   "src/pki/parsed_certificate.h",
-  "src/pki/parser.cc",
   "src/pki/parser.h",
-  "src/pki/path_builder.cc",
   "src/pki/path_builder.h",
-  "src/pki/pem.cc",
   "src/pki/pem.h",
-  "src/pki/revocation_util.cc",
   "src/pki/revocation_util.h",
-  "src/pki/signature_algorithm.cc",
   "src/pki/signature_algorithm.h",
-  "src/pki/signature_verify_cache.h",
-  "src/pki/simple_path_builder_delegate.cc",
   "src/pki/simple_path_builder_delegate.h",
-  "src/pki/string_util.cc",
   "src/pki/string_util.h",
-  "src/pki/tag.cc",
-  "src/pki/tag.h",
   "src/pki/test_helpers.h",
   "src/pki/testdata/nist-pkits/pkits_testcases-inl.h",
-  "src/pki/testdata/test_certificate_data.h",
-  "src/pki/trust_store.cc",
   "src/pki/trust_store.h",
-  "src/pki/trust_store_collection.cc",
   "src/pki/trust_store_collection.h",
-  "src/pki/trust_store_in_memory.cc",
   "src/pki/trust_store_in_memory.h",
-  "src/pki/verify_certificate_chain.cc",
   "src/pki/verify_certificate_chain.h",
   "src/pki/verify_certificate_chain_typed_unittest.h",
-  "src/pki/verify_name_match.cc",
   "src/pki/verify_name_match.h",
-  "src/pki/verify_signed_data.cc",
   "src/pki/verify_signed_data.h",
 ]
 
+pki_headers = [
+  "src/include/openssl/pki/certificate.h",
+  "src/include/openssl/pki/signature_verify_cache.h",
+  "src/include/openssl/pki/verify_error.h",
+]
+
 tool_sources = [
   "src/tool/args.cc",
   "src/tool/ciphers.cc",
diff --git a/BUILD.generated_tests.gni b/BUILD.generated_tests.gni
index d7b63a7..a306858 100644
--- a/BUILD.generated_tests.gni
+++ b/BUILD.generated_tests.gni
@@ -19,7 +19,12 @@
   "src/crypto/test/abi_test.h",
   "src/crypto/test/file_test.cc",
   "src/crypto/test/file_test.h",
+  "src/crypto/test/file_test_gtest.cc",
+  "src/crypto/test/file_util.cc",
+  "src/crypto/test/file_util.h",
   "src/crypto/test/gtest_main.h",
+  "src/crypto/test/test_data.cc",
+  "src/crypto/test/test_data.h",
   "src/crypto/test/test_util.cc",
   "src/crypto/test/test_util.h",
   "src/crypto/test/wycheproof_util.cc",
@@ -36,7 +41,6 @@
 ]
 
 crypto_test_sources = [
-  "crypto_test_data.cc",
   "src/crypto/abi_self_test.cc",
   "src/crypto/asn1/asn1_test.cc",
   "src/crypto/base64/base64_test.cc",
@@ -69,6 +73,7 @@
   "src/crypto/fipsmodule/cmac/cmac_test.cc",
   "src/crypto/fipsmodule/ec/ec_test.cc",
   "src/crypto/fipsmodule/ec/p256-nistz_test.cc",
+  "src/crypto/fipsmodule/ec/p256_test.cc",
   "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc",
   "src/crypto/fipsmodule/hkdf/hkdf_test.cc",
   "src/crypto/fipsmodule/md5/md5_test.cc",
@@ -81,6 +86,7 @@
   "src/crypto/hpke/hpke_test.cc",
   "src/crypto/hrss/hrss_test.cc",
   "src/crypto/impl_dispatch_test.cc",
+  "src/crypto/keccak/keccak_test.cc",
   "src/crypto/kyber/kyber_test.cc",
   "src/crypto/lhash/lhash_test.cc",
   "src/crypto/obj/obj_test.cc",
@@ -96,14 +102,14 @@
   "src/crypto/rsa_extra/rsa_test.cc",
   "src/crypto/self_test.cc",
   "src/crypto/siphash/siphash_test.cc",
+  "src/crypto/spx/spx_test.cc",
   "src/crypto/stack/stack_test.cc",
-  "src/crypto/test/file_test_gtest.cc",
   "src/crypto/test/gtest_main.cc",
   "src/crypto/thread_test.cc",
   "src/crypto/trust_token/trust_token_test.cc",
+  "src/crypto/x509/tab_test.cc",
   "src/crypto/x509/x509_test.cc",
   "src/crypto/x509/x509_time_test.cc",
-  "src/crypto/x509v3/tab_test.cc",
 ]
 
 crypto_test_data = [
@@ -167,9 +173,14 @@
   "src/crypto/fipsmodule/rand/ctrdrbg_vectors.txt",
   "src/crypto/hmac_extra/hmac_tests.txt",
   "src/crypto/hpke/hpke_test_vectors.txt",
-  "src/crypto/kyber/keccak_tests.txt",
+  "src/crypto/keccak/keccak_tests.txt",
   "src/crypto/kyber/kyber_tests.txt",
+  "src/crypto/pkcs8/test/bad1.p12",
+  "src/crypto/pkcs8/test/bad2.p12",
+  "src/crypto/pkcs8/test/bad3.p12",
   "src/crypto/pkcs8/test/empty_password.p12",
+  "src/crypto/pkcs8/test/empty_password_ber.p12",
+  "src/crypto/pkcs8/test/empty_password_ber_nested.p12",
   "src/crypto/pkcs8/test/no_encryption.p12",
   "src/crypto/pkcs8/test/nss.p12",
   "src/crypto/pkcs8/test/null_password.p12",
@@ -180,6 +191,8 @@
   "src/crypto/pkcs8/test/windows.p12",
   "src/crypto/poly1305/poly1305_tests.txt",
   "src/crypto/siphash/siphash_tests.txt",
+  "src/crypto/spx/spx_tests.txt",
+  "src/crypto/spx/spx_tests_deterministic.txt",
   "src/crypto/x509/test/basic_constraints_ca.pem",
   "src/crypto/x509/test/basic_constraints_ca_pathlen_0.pem",
   "src/crypto/x509/test/basic_constraints_ca_pathlen_1.pem",
@@ -346,27 +359,14 @@
   "src/pki/testdata/cert_issuer_source_static_unittest/d.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/e1.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/e2.pem",
-  "src/pki/testdata/cert_issuer_source_static_unittest/generate-certs.py",
   "src/pki/testdata/cert_issuer_source_static_unittest/i1_1.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/i1_2.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/i2.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/i3_1.pem",
   "src/pki/testdata/cert_issuer_source_static_unittest/i3_2.pem",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/C1.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/C2.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/D.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/E1.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/E2.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/I1.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/I2.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/I3.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/I3_1.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/Root.key",
-  "src/pki/testdata/cert_issuer_source_static_unittest/keys/i1_1.key",
   "src/pki/testdata/cert_issuer_source_static_unittest/root.pem",
   "src/pki/testdata/certificate_policies_unittest/anypolicy.pem",
   "src/pki/testdata/certificate_policies_unittest/anypolicy_with_qualifier.pem",
-  "src/pki/testdata/certificate_policies_unittest/generate_policies.py",
   "src/pki/testdata/certificate_policies_unittest/invalid-anypolicy_with_custom_qualifier.pem",
   "src/pki/testdata/certificate_policies_unittest/invalid-empty.pem",
   "src/pki/testdata/certificate_policies_unittest/invalid-policy_1_2_3_dupe.pem",
@@ -395,7 +395,6 @@
   "src/pki/testdata/crl_unittest/bad_thisupdate_in_future.pem",
   "src/pki/testdata/crl_unittest/bad_thisupdate_too_old.pem",
   "src/pki/testdata/crl_unittest/bad_wrong_issuer.pem",
-  "src/pki/testdata/crl_unittest/generate_crl_test_data.py",
   "src/pki/testdata/crl_unittest/good.pem",
   "src/pki/testdata/crl_unittest/good_fake_extension.pem",
   "src/pki/testdata/crl_unittest/good_fake_extension_no_nextupdate.pem",
@@ -466,7 +465,6 @@
   "src/pki/testdata/name_constraints_unittest/dnsname2.pem",
   "src/pki/testdata/name_constraints_unittest/edipartyname-excluded.pem",
   "src/pki/testdata/name_constraints_unittest/edipartyname-permitted.pem",
-  "src/pki/testdata/name_constraints_unittest/generate_name_constraints.py",
   "src/pki/testdata/name_constraints_unittest/invalid-empty_excluded_subtree.pem",
   "src/pki/testdata/name_constraints_unittest/invalid-empty_permitted_subtree.pem",
   "src/pki/testdata/name_constraints_unittest/invalid-no_subtrees.pem",
@@ -554,8 +552,6 @@
   "src/pki/testdata/name_constraints_unittest/uri-permitted.pem",
   "src/pki/testdata/name_constraints_unittest/x400address-excluded.pem",
   "src/pki/testdata/name_constraints_unittest/x400address-permitted.pem",
-  "src/pki/testdata/nist-pkits/BUILD.gn",
-  "src/pki/testdata/nist-pkits/README.chromium",
   "src/pki/testdata/nist-pkits/certs/AllCertificatesNoPoliciesTest2EE.crt",
   "src/pki/testdata/nist-pkits/certs/AllCertificatesSamePoliciesTest10EE.crt",
   "src/pki/testdata/nist-pkits/certs/AllCertificatesSamePoliciesTest13EE.crt",
@@ -1134,11 +1130,6 @@
   "src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subCARE2CRL.crl",
   "src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subsubCARE2RE4CRL.crl",
   "src/pki/testdata/nist-pkits/crls/requireExplicitPolicy7subsubsubCARE2RE4CRL.crl",
-  "src/pki/testdata/nist-pkits/generate_tests.py",
-  "src/pki/testdata/nist-pkits/pkits_testcases-inl.h",
-  "src/pki/testdata/nist-pkits/test_bundle_data.filelist",
-  "src/pki/testdata/nist-pkits/test_bundle_data.globlist",
-  "src/pki/testdata/ocsp_unittest/annotate_test_data.py",
   "src/pki/testdata/ocsp_unittest/bad_ocsp_type.pem",
   "src/pki/testdata/ocsp_unittest/bad_signature.pem",
   "src/pki/testdata/ocsp_unittest/bad_status.pem",
@@ -1151,7 +1142,6 @@
   "src/pki/testdata/ocsp_unittest/has_extension.pem",
   "src/pki/testdata/ocsp_unittest/has_single_extension.pem",
   "src/pki/testdata/ocsp_unittest/has_version.pem",
-  "src/pki/testdata/ocsp_unittest/make_ocsp.py",
   "src/pki/testdata/ocsp_unittest/malformed_request.pem",
   "src/pki/testdata/ocsp_unittest/missing_response.pem",
   "src/pki/testdata/ocsp_unittest/multiple_response.pem",
@@ -1170,7 +1160,6 @@
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/empty_sequence.pem",
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/extra_contents_after_extension_sequence.pem",
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/extra_contents_after_issuer_and_serial.pem",
-  "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/generate.py",
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_contents.pem",
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_issuer.pem",
   "src/pki/testdata/parse_certificate_unittest/authority_key_identifier/invalid_key_identifier.pem",
@@ -1229,8 +1218,6 @@
   "src/pki/testdata/parse_certificate_unittest/policy_constraints_inhibit_require.pem",
   "src/pki/testdata/parse_certificate_unittest/policy_constraints_require.pem",
   "src/pki/testdata/parse_certificate_unittest/policy_qualifiers_empty_sequence.pem",
-  "src/pki/testdata/parse_certificate_unittest/rebase-errors.py",
-  "src/pki/testdata/parse_certificate_unittest/regenerate_pem_from_ascii.py",
   "src/pki/testdata/parse_certificate_unittest/serial_37_bytes.pem",
   "src/pki/testdata/parse_certificate_unittest/serial_negative.pem",
   "src/pki/testdata/parse_certificate_unittest/serial_not_minimal.pem",
@@ -1273,20 +1260,12 @@
   "src/pki/testdata/parse_certificate_unittest/tbs_validity_relaxed.pem",
   "src/pki/testdata/parse_certificate_unittest/tbs_validity_utc_time_and_generalized_time.pem",
   "src/pki/testdata/parse_certificate_unittest/v1_explicit_version.pem",
-  "src/pki/testdata/parse_certificate_unittest/v3_certificate_template.pk8",
-  "src/pki/testdata/parse_certificate_unittest/v3_certificate_template.txt",
-  "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/generate-certs.py",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_match_name_only.pem",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_matching.pem",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/int_mismatch.pem",
-  "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Intermediate.key",
-  "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Root.key",
-  "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Root2.key",
-  "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/keys/Target.key",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/root.pem",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/root2.pem",
   "src/pki/testdata/path_builder_unittest/key_id_name_and_serial_prioritization/target.pem",
-  "src/pki/testdata/path_builder_unittest/key_id_prioritization/generate-certs.py",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_a.pem",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_b.pem",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_different_ski_c.pem",
@@ -1296,277 +1275,34 @@
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_a.pem",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_b.pem",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/int_no_ski_c.pem",
-  "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Intermediate.key",
-  "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Intermediate_1.key",
-  "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Root.key",
-  "src/pki/testdata/path_builder_unittest/key_id_prioritization/keys/Target.key",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/root.pem",
   "src/pki/testdata/path_builder_unittest/key_id_prioritization/target.pem",
-  "src/pki/testdata/path_builder_unittest/self_issued_prioritization/generate-certs.py",
-  "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Root1.key",
-  "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Root2.key",
-  "src/pki/testdata/path_builder_unittest/self_issued_prioritization/keys/Target.key",
+  "src/pki/testdata/path_builder_unittest/multi-root-A-by-B.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-B-by-C.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-B-by-F.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-C-by-D.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-C-by-E.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-D-by-D.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-E-by-E.pem",
+  "src/pki/testdata/path_builder_unittest/multi-root-F-by-E.pem",
+  "src/pki/testdata/path_builder_unittest/precertificate/precertificate.pem",
+  "src/pki/testdata/path_builder_unittest/precertificate/root.pem",
   "src/pki/testdata/path_builder_unittest/self_issued_prioritization/root1.pem",
   "src/pki/testdata/path_builder_unittest/self_issued_prioritization/root1_cross.pem",
   "src/pki/testdata/path_builder_unittest/self_issued_prioritization/root2.pem",
   "src/pki/testdata/path_builder_unittest/self_issued_prioritization/target.pem",
-  "src/pki/testdata/path_builder_unittest/validity_date_prioritization/generate-certs.py",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_ac.pem",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_ad.pem",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_bc.pem",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/int_bd.pem",
-  "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Intermediate.key",
-  "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Root.key",
-  "src/pki/testdata/path_builder_unittest/validity_date_prioritization/keys/Target.key",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/root.pem",
   "src/pki/testdata/path_builder_unittest/validity_date_prioritization/target.pem",
-  "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-1024-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-2048-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-768-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/1024-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/1024-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/10_year_validity.pem",
-  "src/pki/testdata/ssl/certificates/11_year_validity.pem",
-  "src/pki/testdata/ssl/certificates/2029_globalsign_com_cert.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-1024-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-2048-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-768-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/2048-rsa-root.pem",
-  "src/pki/testdata/ssl/certificates/398_days_1_second_after_2020_09_01.pem",
-  "src/pki/testdata/ssl/certificates/398_days_after_2020_09_01.pem",
-  "src/pki/testdata/ssl/certificates/399_days_after_2020_09_01.pem",
-  "src/pki/testdata/ssl/certificates/39_months_after_2015_04.pem",
-  "src/pki/testdata/ssl/certificates/39_months_based_on_last_day.pem",
-  "src/pki/testdata/ssl/certificates/40_months_after_2015_04.pem",
-  "src/pki/testdata/ssl/certificates/60_months_after_2012_07.pem",
-  "src/pki/testdata/ssl/certificates/61_months_after_2012_07.pem",
-  "src/pki/testdata/ssl/certificates/768-rsa-ee-by-1024-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/768-rsa-ee-by-2048-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/768-rsa-ee-by-768-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/768-rsa-ee-by-prime256v1-ecdsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/768-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/825_days_1_second_after_2018_03_01.pem",
-  "src/pki/testdata/ssl/certificates/825_days_after_2018_03_01.pem",
-  "src/pki/testdata/ssl/certificates/826_days_after_2018_03_01.pem",
-  "src/pki/testdata/ssl/certificates/900_days_after_2019_07_01.pem",
-  "src/pki/testdata/ssl/certificates/BUILD.gn",
-  "src/pki/testdata/ssl/certificates/README",
-  "src/pki/testdata/ssl/certificates/bad_validity.pem",
-  "src/pki/testdata/ssl/certificates/can_sign_http_exchanges_draft_extension.pem",
-  "src/pki/testdata/ssl/certificates/can_sign_http_exchanges_draft_extension_invalid.pem",
-  "src/pki/testdata/ssl/certificates/client-empty-password.p12",
-  "src/pki/testdata/ssl/certificates/client-nokey.p12",
-  "src/pki/testdata/ssl/certificates/client-null-password.p12",
-  "src/pki/testdata/ssl/certificates/client.p12",
-  "src/pki/testdata/ssl/certificates/client_1.key",
-  "src/pki/testdata/ssl/certificates/client_1.pem",
-  "src/pki/testdata/ssl/certificates/client_1.pk8",
-  "src/pki/testdata/ssl/certificates/client_1_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_2.key",
-  "src/pki/testdata/ssl/certificates/client_2.pem",
-  "src/pki/testdata/ssl/certificates/client_2.pk8",
-  "src/pki/testdata/ssl/certificates/client_2_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_3.key",
-  "src/pki/testdata/ssl/certificates/client_3.pem",
-  "src/pki/testdata/ssl/certificates/client_3.pk8",
-  "src/pki/testdata/ssl/certificates/client_3_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_4.key",
-  "src/pki/testdata/ssl/certificates/client_4.pem",
-  "src/pki/testdata/ssl/certificates/client_4.pk8",
-  "src/pki/testdata/ssl/certificates/client_4_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_5.key",
-  "src/pki/testdata/ssl/certificates/client_5.pem",
-  "src/pki/testdata/ssl/certificates/client_5.pk8",
-  "src/pki/testdata/ssl/certificates/client_5_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_6.key",
-  "src/pki/testdata/ssl/certificates/client_6.pem",
-  "src/pki/testdata/ssl/certificates/client_6.pk8",
-  "src/pki/testdata/ssl/certificates/client_6_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_7.key",
-  "src/pki/testdata/ssl/certificates/client_7.pem",
-  "src/pki/testdata/ssl/certificates/client_7.pk8",
-  "src/pki/testdata/ssl/certificates/client_7_ca.pem",
-  "src/pki/testdata/ssl/certificates/client_root_ca.pem",
-  "src/pki/testdata/ssl/certificates/common_name_only.pem",
-  "src/pki/testdata/ssl/certificates/crit-codeSigning-chain.pem",
-  "src/pki/testdata/ssl/certificates/crlset_blocked_interception_by_intermediate.raw",
-  "src/pki/testdata/ssl/certificates/crlset_blocked_interception_by_root.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_intermediate_serial.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_leaf_spki.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_leaf_subject_no_spki.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_root_serial.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_root_spki.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_root_subject.raw",
-  "src/pki/testdata/ssl/certificates/crlset_by_root_subject_no_spki.raw",
-  "src/pki/testdata/ssl/certificates/crlset_known_interception_by_root.raw",
-  "src/pki/testdata/ssl/certificates/cross-signed-leaf.pem",
-  "src/pki/testdata/ssl/certificates/cross-signed-root-md5.pem",
-  "src/pki/testdata/ssl/certificates/cross-signed-root-sha256.pem",
-  "src/pki/testdata/ssl/certificates/ct-test-embedded-cert.pem",
-  "src/pki/testdata/ssl/certificates/ct-test-embedded-with-intermediate-chain.pem",
-  "src/pki/testdata/ssl/certificates/ct-test-embedded-with-intermediate-preca-chain.pem",
-  "src/pki/testdata/ssl/certificates/ct-test-embedded-with-preca-chain.pem",
-  "src/pki/testdata/ssl/certificates/ct-test-embedded-with-uids.pem",
-  "src/pki/testdata/ssl/certificates/dec_2017.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_cyber_ca.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_pkioverheid.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_pkioverheid_g2.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_public_ca_2025.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_root_ca.pem",
-  "src/pki/testdata/ssl/certificates/diginotar_services_1024_ca.pem",
-  "src/pki/testdata/ssl/certificates/duplicate_cn_1.p12",
-  "src/pki/testdata/ssl/certificates/duplicate_cn_1.pem",
-  "src/pki/testdata/ssl/certificates/duplicate_cn_2.p12",
-  "src/pki/testdata/ssl/certificates/duplicate_cn_2.pem",
-  "src/pki/testdata/ssl/certificates/ec-prime256v1-1.key",
-  "src/pki/testdata/ssl/certificates/ec-prime256v1-2.key",
-  "src/pki/testdata/ssl/certificates/ec-prime256v1-3.key",
-  "src/pki/testdata/ssl/certificates/eku-test-root.pem",
-  "src/pki/testdata/ssl/certificates/ev_test.pem",
-  "src/pki/testdata/ssl/certificates/ev_test_state_only.pem",
-  "src/pki/testdata/ssl/certificates/expired_cert.pem",
-  "src/pki/testdata/ssl/certificates/foaf.me.chromium-test-cert.der",
-  "src/pki/testdata/ssl/certificates/google.binary.p7b",
-  "src/pki/testdata/ssl/certificates/google.chain.pem",
-  "src/pki/testdata/ssl/certificates/google.pem_cert.p7b",
-  "src/pki/testdata/ssl/certificates/google.pem_pkcs7.p7b",
-  "src/pki/testdata/ssl/certificates/google.single.der",
-  "src/pki/testdata/ssl/certificates/google.single.pem",
-  "src/pki/testdata/ssl/certificates/google_diginotar.pem",
-  "src/pki/testdata/ssl/certificates/intermediate_ca_cert.pem",
-  "src/pki/testdata/ssl/certificates/invalid_key_usage_cert.der",
-  "src/pki/testdata/ssl/certificates/key_usage_p256.key",
-  "src/pki/testdata/ssl/certificates/key_usage_p256_both.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_p256_digitalsignature.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_p256_keyagreement.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_p256_no_extension.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_rsa.key",
-  "src/pki/testdata/ssl/certificates/key_usage_rsa_both.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_rsa_digitalsignature.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_rsa_keyencipherment.pem",
-  "src/pki/testdata/ssl/certificates/key_usage_rsa_no_extension.pem",
-  "src/pki/testdata/ssl/certificates/large_key.pem",
-  "src/pki/testdata/ssl/certificates/leaf_from_known_root.pem",
-  "src/pki/testdata/ssl/certificates/lets-encrypt-dst-x3-root.pem",
-  "src/pki/testdata/ssl/certificates/lets-encrypt-isrg-x1-root.pem",
-  "src/pki/testdata/ssl/certificates/localhost_cert.pem",
-  "src/pki/testdata/ssl/certificates/may_2018.pem",
-  "src/pki/testdata/ssl/certificates/mit.davidben.der",
-  "src/pki/testdata/ssl/certificates/multi-root-A-by-B.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-B-by-C.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-B-by-F.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-C-by-D.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-C-by-E.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-D-by-D.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-E-by-E.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-F-by-E.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-chain1.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-chain2.pem",
-  "src/pki/testdata/ssl/certificates/multi-root-crlset-C.raw",
-  "src/pki/testdata/ssl/certificates/multi-root-crlset-CD-and-FE.raw",
-  "src/pki/testdata/ssl/certificates/multi-root-crlset-D-and-E.raw",
-  "src/pki/testdata/ssl/certificates/multi-root-crlset-E.raw",
-  "src/pki/testdata/ssl/certificates/multi-root-crlset-unrelated.raw",
-  "src/pki/testdata/ssl/certificates/multi-root.keychain",
-  "src/pki/testdata/ssl/certificates/multivalue_rdn.pem",
-  "src/pki/testdata/ssl/certificates/name_constrained_key.pem",
-  "src/pki/testdata/ssl/certificates/ndn.ca.crt",
-  "src/pki/testdata/ssl/certificates/nist.der",
-  "src/pki/testdata/ssl/certificates/no_subject_common_name_cert.pem",
-  "src/pki/testdata/ssl/certificates/non-crit-codeSigning-chain.pem",
-  "src/pki/testdata/ssl/certificates/ok_cert.pem",
-  "src/pki/testdata/ssl/certificates/ok_cert_by_intermediate.pem",
-  "src/pki/testdata/ssl/certificates/policies_sanity_check.pem",
-  "src/pki/testdata/ssl/certificates/post_june_2016.pem",
-  "src/pki/testdata/ssl/certificates/pre_br_validity_bad_121.pem",
-  "src/pki/testdata/ssl/certificates/pre_br_validity_bad_2020.pem",
-  "src/pki/testdata/ssl/certificates/pre_br_validity_ok.pem",
-  "src/pki/testdata/ssl/certificates/pre_june_2016.pem",
-  "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-1024-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-2048-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-768-rsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-ee-by-prime256v1-ecdsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/prime256v1-ecdsa-intermediate.pem",
-  "src/pki/testdata/ssl/certificates/punycodetest.pem",
-  "src/pki/testdata/ssl/certificates/quic-chain.pem",
-  "src/pki/testdata/ssl/certificates/quic-ecdsa-leaf.key",
-  "src/pki/testdata/ssl/certificates/quic-leaf-cert.key",
-  "src/pki/testdata/ssl/certificates/quic-leaf-cert.key.pkcs8.pem",
-  "src/pki/testdata/ssl/certificates/quic-leaf-cert.key.sct",
-  "src/pki/testdata/ssl/certificates/quic-root.pem",
-  "src/pki/testdata/ssl/certificates/quic-short-lived.pem",
-  "src/pki/testdata/ssl/certificates/redundant-server-chain.pem",
-  "src/pki/testdata/ssl/certificates/redundant-validated-chain-root.pem",
-  "src/pki/testdata/ssl/certificates/redundant-validated-chain.pem",
-  "src/pki/testdata/ssl/certificates/root_ca_cert.pem",
-  "src/pki/testdata/ssl/certificates/rsa-1024-1.key",
-  "src/pki/testdata/ssl/certificates/rsa-1024-2.key",
-  "src/pki/testdata/ssl/certificates/rsa-1024-3.key",
-  "src/pki/testdata/ssl/certificates/rsa-2048-1.key",
-  "src/pki/testdata/ssl/certificates/rsa-2048-2.key",
-  "src/pki/testdata/ssl/certificates/rsa-2048-3.key",
-  "src/pki/testdata/ssl/certificates/rsa-768-1.key",
-  "src/pki/testdata/ssl/certificates/rsa-768-2.key",
-  "src/pki/testdata/ssl/certificates/rsa-768-3.key",
-  "src/pki/testdata/ssl/certificates/rsa-8200-1.key",
-  "src/pki/testdata/ssl/certificates/salesforce_com_test.pem",
-  "src/pki/testdata/ssl/certificates/self-signed-invalid-name.pem",
-  "src/pki/testdata/ssl/certificates/self-signed-invalid-sig.pem",
-  "src/pki/testdata/ssl/certificates/sha1_2016.pem",
-  "src/pki/testdata/ssl/certificates/sha1_leaf.pem",
-  "src/pki/testdata/ssl/certificates/spdy_pooling.pem",
-  "src/pki/testdata/ssl/certificates/start_after_expiry.pem",
-  "src/pki/testdata/ssl/certificates/subjectAltName_sanity_check.pem",
-  "src/pki/testdata/ssl/certificates/subjectAltName_www_example_com.pem",
-  "src/pki/testdata/ssl/certificates/test_names.pem",
-  "src/pki/testdata/ssl/certificates/treadclimber.pem",
-  "src/pki/testdata/ssl/certificates/treadclimber.sctlist",
-  "src/pki/testdata/ssl/certificates/unescaped.pem",
-  "src/pki/testdata/ssl/certificates/unittest.key.bin",
-  "src/pki/testdata/ssl/certificates/unittest.selfsigned.der",
-  "src/pki/testdata/ssl/certificates/verisign_intermediate_ca_2011.pem",
-  "src/pki/testdata/ssl/certificates/verisign_intermediate_ca_2016.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md2_ee.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md2_intermediate.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md2_root.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md4_ee.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md4_intermediate.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md4_root.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md5_ee.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md5_intermediate.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_md5_root.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_sha1_ee.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_sha1_intermediate.pem",
-  "src/pki/testdata/ssl/certificates/weak_digest_sha1_root.pem",
-  "src/pki/testdata/ssl/certificates/websocket_cacert.pem",
-  "src/pki/testdata/ssl/certificates/websocket_client_cert.p12",
-  "src/pki/testdata/ssl/certificates/wildcard.pem",
-  "src/pki/testdata/ssl/certificates/x509_verify_results.chain.pem",
-  "src/pki/testdata/test_certificate_data.h",
-  "src/pki/testdata/verify_certificate_chain_unittest/README",
   "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Intermediate_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/basic-constraints-pathlen-0-self-issued/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/not-after.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-intermediate/not-before.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/Target.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/expired-unconstrained-root_Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-root/keys/expired-unconstrained-root_Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-expiration-and-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-after-ta-with-expiration.test",
@@ -1574,67 +1310,32 @@
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-before-ta-with-expiration.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-root/not-before.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-target/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-target/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/expired-target/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-target/not-after.test",
   "src/pki/testdata/verify_certificate_chain_unittest/expired-target/not-before.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/generate-all.sh",
   "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/BogusRoot.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/incorrect-trust-anchor/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Intermediate_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Root_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-and-target-wrong-signature/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-ca-false/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-basic-constraints-not-critical/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/keys/Target.key",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-any-and-clientauth/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/keys/Target.key",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-clientauth/serverauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Intermediate_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Root_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Target.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/keys/Target_1.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-chain.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-eku-any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha1-eku-clientAuth-strict.test",
@@ -1647,52 +1348,24 @@
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-clientAuth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-serverAuth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-eku-server-gated-crypto/sha256-eku-serverAuth.test",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-invalid-spki/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-invalid-spki/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-basic-constraints/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-lacks-signing-key-usage/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-signed-with-sha1/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-critical-extension/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-unknown-non-critical-extension/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Root_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/intermediate-wrong-signature-no-authority-key-identifier/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/anchor.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/target.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/issuer-and-subject-not-byte-for-byte-equal/target.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Root_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/longrolloverchain.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/longrolloverchain.test",
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/newchain.pem",
@@ -1701,94 +1374,8 @@
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/oldchain.test",
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/rolloverchain.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/key-rollover/rolloverchain.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/keys/t0.key",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/ok-all-types.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/ok-all-types.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D4.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D5.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D6.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D7.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D8.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8D9.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8DA.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/2FABB43DDCC077802A0309AD437402BF98D8DB.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F5.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F6.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F7.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F8.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6F9.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FA.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FB.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FC.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/3CE5FC818859A85016C17FD7E52AE5967FC2F6FD.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.attr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.attr.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.db.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate.serial.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_1.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_2.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_3.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_4.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_5.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_6.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Intermediate_7.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.attr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.attr.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.db.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/Root.serial.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_1.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_2.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_3.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_4.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_5.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_6.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/many-names/out/t0_7.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-all-types.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-all-types.test",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-dirnames-excluded.pem",
@@ -1804,11 +1391,6 @@
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-ips-permitted.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/many-names/toomany-ips-permitted.test",
   "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/ShadowRoot.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/non-self-signed-root/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.1.2.txt",
@@ -1906,95 +1488,42 @@
   "src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.9.7.txt",
   "src/pki/testdata/verify_certificate_chain_unittest/pkits_errors/4.9.8.txt",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-fail/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-anypolicy-by-root-ok/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-fail/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-inhibit-mapping-by-root-ok/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-ok/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-ok/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-on-root-wrong/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-fail/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policies-required-by-root-ok/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-fail/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/policy-mappings-on-root-ok/ta-with-constraints.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/rebase-errors.py",
   "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-basic-constraints-ca-false/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-constraints-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-constraints.test",
@@ -2002,28 +1531,16 @@
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth-ta-with-expiration.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-eku-clientauth/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-constraints-require-basic-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-basic-constraints/ta-with-require-basic-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/root-lacks-keycertsign-key-usage/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/chain.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/distrusted-root-expired.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/distrusted-root.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/ta-with-constraints.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/ta-with-expiration.test",
@@ -2032,122 +1549,54 @@
   "src/pki/testdata/verify_certificate_chain_unittest/target-and-intermediate/unspecified-trust-root.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/keys/Target.key",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-any/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/chain.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-clientauth/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/keys/Target.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/3F1D2B1D127E34B62B61B278F274669ADC66ADCC.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/71F49EE7B5F73630C9845EA5B8398B58F3237B18.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/71F49EE7B5F73630C9845EA5B8398B58F3237B19.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db.attr",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.db.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Intermediate.serial.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Issuer.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Issuer.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.attr",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.attr.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.db.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.serial",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Root.serial.old",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.cnf",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.csr",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.db",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/out/Target.serial",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-many/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/any.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/chain.pem",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth-strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/clientauth.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/serverauth-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-eku-none/serverauth.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-512bit-rsa-key/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only-trusted_leaf-strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only-trusted_leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-ca-basic-constraints/target_only.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-keycertsign-but-not-ca/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-has-pathlen-but-not-ca/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-and-eku/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-msapplicationpolicies-no-eku/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/main.test",
+  "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/strict-leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-not-end-entity/strict.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-only/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-only/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-only/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf-and-trust_anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf-not_after.test",
@@ -2155,16 +1604,11 @@
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-only/trusted_leaf_require_self_signed.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/keys/Target.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/keys/Target_1.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf-and-trust_anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfissued/trusted_leaf_require_self_signed.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-and-trust_anchor.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-not_after.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-selfsigned/trusted_leaf-wrong_eku.test",
@@ -2178,11 +1622,6 @@
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyAgreement.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyEncipherment.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/ec-keyEncipherment.test",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Target-ec.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/keys/Target-rsa.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-decipherOnly.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-decipherOnly.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-digitalSignature.pem",
@@ -2192,70 +1631,26 @@
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-keyEncipherment.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-serverauth-various-keyusages/rsa-keyEncipherment.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-by-512bit-rsa/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-using-ecdsa/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-signed-with-sha1/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/target_only-trusted_leaf.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-unknown-critical-extension/target_only.pem",
   "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Intermediate_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature-no-authority-key-identifier/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Intermediate_1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/target-wrong-signature/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/unknown-critical-policy-qualifier/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Intermediate.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/unknown-non-critical-policy-qualifier/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Intermediate1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Intermediate2.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/violates-basic-constraints-pathlen-0/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/chain.pem",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/generate-chains.py",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Intermediate1.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Intermediate2.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Root.key",
-  "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/keys/Target.key",
   "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/main.test",
   "src/pki/testdata/verify_certificate_chain_unittest/violates-pathlen-1-from-root/ta-with-constraints.test",
   "src/pki/testdata/verify_name_match_unittest/names/ascii-BMPSTRING-case_swap-dupe_attr.pem",
@@ -2340,9 +1735,6 @@
   "src/pki/testdata/verify_name_match_unittest/names/unicode_supplementary-UTF8-unmangled.pem",
   "src/pki/testdata/verify_name_match_unittest/names/valid-Name-empty.pem",
   "src/pki/testdata/verify_name_match_unittest/names/valid-minimal.pem",
-  "src/pki/testdata/verify_name_match_unittest/scripts/generate_names.py",
-  "src/pki/testdata/verify_signed_data_unittest/README",
-  "src/pki/testdata/verify_signed_data_unittest/annotate_test_data.py",
   "src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-spki-params-null.pem",
   "src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-unused-bits-signature.pem",
   "src/pki/testdata/verify_signed_data_unittest/ecdsa-prime256v1-sha512-using-ecdh-key.pem",
@@ -2369,6 +1761,8 @@
   "src/pki/testdata/verify_signed_data_unittest/rsa-pss-sha256.pem",
   "src/pki/testdata/verify_signed_data_unittest/rsa-using-ec-key.pem",
   "src/pki/testdata/verify_signed_data_unittest/rsa2048-pkcs1-sha512.pem",
+  "src/pki/testdata/verify_unittest/google-leaf.der",
+  "src/pki/testdata/verify_unittest/self-issued.pem",
 ]
 
 ssl_test_sources = [
@@ -2382,11 +1776,10 @@
   "src/crypto/test/gtest_main.cc",
   "src/pki/cert_issuer_source_static_unittest.cc",
   "src/pki/certificate_policies_unittest.cc",
+  "src/pki/certificate_unittest.cc",
   "src/pki/crl_unittest.cc",
   "src/pki/encode_values_unittest.cc",
   "src/pki/extended_key_usage_unittest.cc",
-  "src/pki/fillins/file_util.cc",
-  "src/pki/fillins/path_service.cc",
   "src/pki/general_names_unittest.cc",
   "src/pki/input_unittest.cc",
   "src/pki/ip_util_unittest.cc",
@@ -2402,11 +1795,13 @@
   "src/pki/path_builder_pkits_unittest.cc",
   "src/pki/path_builder_unittest.cc",
   "src/pki/path_builder_verify_certificate_chain_unittest.cc",
+  "src/pki/pem_unittest.cc",
   "src/pki/signature_algorithm_unittest.cc",
   "src/pki/simple_path_builder_delegate_unittest.cc",
   "src/pki/string_util_unittest.cc",
   "src/pki/test_helpers.cc",
   "src/pki/trust_store_collection_unittest.cc",
+  "src/pki/trust_store_in_memory_unittest.cc",
   "src/pki/verify_certificate_chain_pkits_unittest.cc",
   "src/pki/verify_certificate_chain_unittest.cc",
   "src/pki/verify_name_match_unittest.cc",
diff --git a/apple-aarch64/crypto/chacha/chacha-armv8-apple.S b/apple-aarch64/crypto/chacha/chacha-armv8-apple.S
deleted file mode 100644
index dd1c964..0000000
--- a/apple-aarch64/crypto/chacha/chacha-armv8-apple.S
+++ /dev/null
@@ -1,1984 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-
-.private_extern	_OPENSSL_armcap_P
-
-.section	__TEXT,__const
-
-.align	5
-Lsigma:
-.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
-Lone:
-.long	1,0,0,0
-.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-
-.text
-
-.globl	_ChaCha20_ctr32
-.private_extern	_ChaCha20_ctr32
-
-.align	5
-_ChaCha20_ctr32:
-	AARCH64_VALID_CALL_TARGET
-	cbz	x2,Labort
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x5,_OPENSSL_armcap_P@PAGE
-#endif
-	cmp	x2,#192
-	b.lo	Lshort
-	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w17,#ARMV7_NEON
-	b.ne	ChaCha20_neon
-
-Lshort:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ldp	x28,x30,[x4]		// load counter
-#ifdef	__AARCH64EB__
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-
-Loop_outer:
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	mov	w7,w23
-	lsr	x8,x23,#32
-	mov	w9,w24
-	lsr	x10,x24,#32
-	mov	w11,w25
-	lsr	x12,x25,#32
-	mov	w13,w26
-	lsr	x14,x26,#32
-	mov	w15,w27
-	lsr	x16,x27,#32
-	mov	w17,w28
-	lsr	x19,x28,#32
-	mov	w20,w30
-	lsr	x21,x30,#32
-
-	mov	x4,#10
-	subs	x2,x2,#64
-Loop:
-	sub	x4,x4,#1
-	add	w5,w5,w9
-	add	w6,w6,w10
-	add	w7,w7,w11
-	add	w8,w8,w12
-	eor	w17,w17,w5
-	eor	w19,w19,w6
-	eor	w20,w20,w7
-	eor	w21,w21,w8
-	ror	w17,w17,#16
-	ror	w19,w19,#16
-	ror	w20,w20,#16
-	ror	w21,w21,#16
-	add	w13,w13,w17
-	add	w14,w14,w19
-	add	w15,w15,w20
-	add	w16,w16,w21
-	eor	w9,w9,w13
-	eor	w10,w10,w14
-	eor	w11,w11,w15
-	eor	w12,w12,w16
-	ror	w9,w9,#20
-	ror	w10,w10,#20
-	ror	w11,w11,#20
-	ror	w12,w12,#20
-	add	w5,w5,w9
-	add	w6,w6,w10
-	add	w7,w7,w11
-	add	w8,w8,w12
-	eor	w17,w17,w5
-	eor	w19,w19,w6
-	eor	w20,w20,w7
-	eor	w21,w21,w8
-	ror	w17,w17,#24
-	ror	w19,w19,#24
-	ror	w20,w20,#24
-	ror	w21,w21,#24
-	add	w13,w13,w17
-	add	w14,w14,w19
-	add	w15,w15,w20
-	add	w16,w16,w21
-	eor	w9,w9,w13
-	eor	w10,w10,w14
-	eor	w11,w11,w15
-	eor	w12,w12,w16
-	ror	w9,w9,#25
-	ror	w10,w10,#25
-	ror	w11,w11,#25
-	ror	w12,w12,#25
-	add	w5,w5,w10
-	add	w6,w6,w11
-	add	w7,w7,w12
-	add	w8,w8,w9
-	eor	w21,w21,w5
-	eor	w17,w17,w6
-	eor	w19,w19,w7
-	eor	w20,w20,w8
-	ror	w21,w21,#16
-	ror	w17,w17,#16
-	ror	w19,w19,#16
-	ror	w20,w20,#16
-	add	w15,w15,w21
-	add	w16,w16,w17
-	add	w13,w13,w19
-	add	w14,w14,w20
-	eor	w10,w10,w15
-	eor	w11,w11,w16
-	eor	w12,w12,w13
-	eor	w9,w9,w14
-	ror	w10,w10,#20
-	ror	w11,w11,#20
-	ror	w12,w12,#20
-	ror	w9,w9,#20
-	add	w5,w5,w10
-	add	w6,w6,w11
-	add	w7,w7,w12
-	add	w8,w8,w9
-	eor	w21,w21,w5
-	eor	w17,w17,w6
-	eor	w19,w19,w7
-	eor	w20,w20,w8
-	ror	w21,w21,#24
-	ror	w17,w17,#24
-	ror	w19,w19,#24
-	ror	w20,w20,#24
-	add	w15,w15,w21
-	add	w16,w16,w17
-	add	w13,w13,w19
-	add	w14,w14,w20
-	eor	w10,w10,w15
-	eor	w11,w11,w16
-	eor	w12,w12,w13
-	eor	w9,w9,w14
-	ror	w10,w10,#25
-	ror	w11,w11,#25
-	ror	w12,w12,#25
-	ror	w9,w9,#25
-	cbnz	x4,Loop
-
-	add	w5,w5,w22		// accumulate key block
-	add	x6,x6,x22,lsr#32
-	add	w7,w7,w23
-	add	x8,x8,x23,lsr#32
-	add	w9,w9,w24
-	add	x10,x10,x24,lsr#32
-	add	w11,w11,w25
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	x21,x21,x30,lsr#32
-
-	b.lo	Ltail
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#1			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-
-	b.hi	Loop_outer
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-Labort:
-	ret
-
-.align	4
-Ltail:
-	add	x2,x2,#64
-Less_than_64:
-	sub	x0,x0,#1
-	add	x1,x1,x2
-	add	x0,x0,x2
-	add	x4,sp,x2
-	neg	x2,x2
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	stp	x5,x7,[sp,#0]
-	stp	x9,x11,[sp,#16]
-	stp	x13,x15,[sp,#32]
-	stp	x17,x20,[sp,#48]
-
-Loop_tail:
-	ldrb	w10,[x1,x2]
-	ldrb	w11,[x4,x2]
-	add	x2,x2,#1
-	eor	w10,w10,w11
-	strb	w10,[x0,x2]
-	cbnz	x2,Loop_tail
-
-	stp	xzr,xzr,[sp,#0]
-	stp	xzr,xzr,[sp,#16]
-	stp	xzr,xzr,[sp,#32]
-	stp	xzr,xzr,[sp,#48]
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
-.align	5
-ChaCha20_neon:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	cmp	x2,#512
-	b.hs	L512_or_more_neon
-
-	sub	sp,sp,#64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ld1	{v24.4s},[x5],#16
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ld1	{v25.4s,v26.4s},[x3]
-	ldp	x28,x30,[x4]		// load counter
-	ld1	{v27.4s},[x4]
-	ld1	{v31.4s},[x5]
-#ifdef	__AARCH64EB__
-	rev64	v24.4s,v24.4s
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-	add	v27.4s,v27.4s,v31.4s		// += 1
-	add	v28.4s,v27.4s,v31.4s
-	add	v29.4s,v28.4s,v31.4s
-	shl	v31.4s,v31.4s,#2			// 1 -> 4
-
-Loop_outer_neon:
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	mov	v0.16b,v24.16b
-	mov	w7,w23
-	lsr	x8,x23,#32
-	mov	v4.16b,v24.16b
-	mov	w9,w24
-	lsr	x10,x24,#32
-	mov	v16.16b,v24.16b
-	mov	w11,w25
-	mov	v1.16b,v25.16b
-	lsr	x12,x25,#32
-	mov	v5.16b,v25.16b
-	mov	w13,w26
-	mov	v17.16b,v25.16b
-	lsr	x14,x26,#32
-	mov	v3.16b,v27.16b
-	mov	w15,w27
-	mov	v7.16b,v28.16b
-	lsr	x16,x27,#32
-	mov	v19.16b,v29.16b
-	mov	w17,w28
-	mov	v2.16b,v26.16b
-	lsr	x19,x28,#32
-	mov	v6.16b,v26.16b
-	mov	w20,w30
-	mov	v18.16b,v26.16b
-	lsr	x21,x30,#32
-
-	mov	x4,#10
-	subs	x2,x2,#256
-Loop_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v16.4s,v16.4s,v17.4s
-	add	w7,w7,w11
-	eor	v3.16b,v3.16b,v0.16b
-	add	w8,w8,w12
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w17,w17,w5
-	eor	v19.16b,v19.16b,v16.16b
-	eor	w19,w19,w6
-	rev32	v3.8h,v3.8h
-	eor	w20,w20,w7
-	rev32	v7.8h,v7.8h
-	eor	w21,w21,w8
-	rev32	v19.8h,v19.8h
-	ror	w17,w17,#16
-	add	v2.4s,v2.4s,v3.4s
-	ror	w19,w19,#16
-	add	v6.4s,v6.4s,v7.4s
-	ror	w20,w20,#16
-	add	v18.4s,v18.4s,v19.4s
-	ror	w21,w21,#16
-	eor	v20.16b,v1.16b,v2.16b
-	add	w13,w13,w17
-	eor	v21.16b,v5.16b,v6.16b
-	add	w14,w14,w19
-	eor	v22.16b,v17.16b,v18.16b
-	add	w15,w15,w20
-	ushr	v1.4s,v20.4s,#20
-	add	w16,w16,w21
-	ushr	v5.4s,v21.4s,#20
-	eor	w9,w9,w13
-	ushr	v17.4s,v22.4s,#20
-	eor	w10,w10,w14
-	sli	v1.4s,v20.4s,#12
-	eor	w11,w11,w15
-	sli	v5.4s,v21.4s,#12
-	eor	w12,w12,w16
-	sli	v17.4s,v22.4s,#12
-	ror	w9,w9,#20
-	add	v0.4s,v0.4s,v1.4s
-	ror	w10,w10,#20
-	add	v4.4s,v4.4s,v5.4s
-	ror	w11,w11,#20
-	add	v16.4s,v16.4s,v17.4s
-	ror	w12,w12,#20
-	eor	v20.16b,v3.16b,v0.16b
-	add	w5,w5,w9
-	eor	v21.16b,v7.16b,v4.16b
-	add	w6,w6,w10
-	eor	v22.16b,v19.16b,v16.16b
-	add	w7,w7,w11
-	ushr	v3.4s,v20.4s,#24
-	add	w8,w8,w12
-	ushr	v7.4s,v21.4s,#24
-	eor	w17,w17,w5
-	ushr	v19.4s,v22.4s,#24
-	eor	w19,w19,w6
-	sli	v3.4s,v20.4s,#8
-	eor	w20,w20,w7
-	sli	v7.4s,v21.4s,#8
-	eor	w21,w21,w8
-	sli	v19.4s,v22.4s,#8
-	ror	w17,w17,#24
-	add	v2.4s,v2.4s,v3.4s
-	ror	w19,w19,#24
-	add	v6.4s,v6.4s,v7.4s
-	ror	w20,w20,#24
-	add	v18.4s,v18.4s,v19.4s
-	ror	w21,w21,#24
-	eor	v20.16b,v1.16b,v2.16b
-	add	w13,w13,w17
-	eor	v21.16b,v5.16b,v6.16b
-	add	w14,w14,w19
-	eor	v22.16b,v17.16b,v18.16b
-	add	w15,w15,w20
-	ushr	v1.4s,v20.4s,#25
-	add	w16,w16,w21
-	ushr	v5.4s,v21.4s,#25
-	eor	w9,w9,w13
-	ushr	v17.4s,v22.4s,#25
-	eor	w10,w10,w14
-	sli	v1.4s,v20.4s,#7
-	eor	w11,w11,w15
-	sli	v5.4s,v21.4s,#7
-	eor	w12,w12,w16
-	sli	v17.4s,v22.4s,#7
-	ror	w9,w9,#25
-	ext	v2.16b,v2.16b,v2.16b,#8
-	ror	w10,w10,#25
-	ext	v6.16b,v6.16b,v6.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w10
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w11
-	add	v16.4s,v16.4s,v17.4s
-	add	w7,w7,w12
-	eor	v3.16b,v3.16b,v0.16b
-	add	w8,w8,w9
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w5
-	eor	v19.16b,v19.16b,v16.16b
-	eor	w17,w17,w6
-	rev32	v3.8h,v3.8h
-	eor	w19,w19,w7
-	rev32	v7.8h,v7.8h
-	eor	w20,w20,w8
-	rev32	v19.8h,v19.8h
-	ror	w21,w21,#16
-	add	v2.4s,v2.4s,v3.4s
-	ror	w17,w17,#16
-	add	v6.4s,v6.4s,v7.4s
-	ror	w19,w19,#16
-	add	v18.4s,v18.4s,v19.4s
-	ror	w20,w20,#16
-	eor	v20.16b,v1.16b,v2.16b
-	add	w15,w15,w21
-	eor	v21.16b,v5.16b,v6.16b
-	add	w16,w16,w17
-	eor	v22.16b,v17.16b,v18.16b
-	add	w13,w13,w19
-	ushr	v1.4s,v20.4s,#20
-	add	w14,w14,w20
-	ushr	v5.4s,v21.4s,#20
-	eor	w10,w10,w15
-	ushr	v17.4s,v22.4s,#20
-	eor	w11,w11,w16
-	sli	v1.4s,v20.4s,#12
-	eor	w12,w12,w13
-	sli	v5.4s,v21.4s,#12
-	eor	w9,w9,w14
-	sli	v17.4s,v22.4s,#12
-	ror	w10,w10,#20
-	add	v0.4s,v0.4s,v1.4s
-	ror	w11,w11,#20
-	add	v4.4s,v4.4s,v5.4s
-	ror	w12,w12,#20
-	add	v16.4s,v16.4s,v17.4s
-	ror	w9,w9,#20
-	eor	v20.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v21.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v22.16b,v19.16b,v16.16b
-	add	w7,w7,w12
-	ushr	v3.4s,v20.4s,#24
-	add	w8,w8,w9
-	ushr	v7.4s,v21.4s,#24
-	eor	w21,w21,w5
-	ushr	v19.4s,v22.4s,#24
-	eor	w17,w17,w6
-	sli	v3.4s,v20.4s,#8
-	eor	w19,w19,w7
-	sli	v7.4s,v21.4s,#8
-	eor	w20,w20,w8
-	sli	v19.4s,v22.4s,#8
-	ror	w21,w21,#24
-	add	v2.4s,v2.4s,v3.4s
-	ror	w17,w17,#24
-	add	v6.4s,v6.4s,v7.4s
-	ror	w19,w19,#24
-	add	v18.4s,v18.4s,v19.4s
-	ror	w20,w20,#24
-	eor	v20.16b,v1.16b,v2.16b
-	add	w15,w15,w21
-	eor	v21.16b,v5.16b,v6.16b
-	add	w16,w16,w17
-	eor	v22.16b,v17.16b,v18.16b
-	add	w13,w13,w19
-	ushr	v1.4s,v20.4s,#25
-	add	w14,w14,w20
-	ushr	v5.4s,v21.4s,#25
-	eor	w10,w10,w15
-	ushr	v17.4s,v22.4s,#25
-	eor	w11,w11,w16
-	sli	v1.4s,v20.4s,#7
-	eor	w12,w12,w13
-	sli	v5.4s,v21.4s,#7
-	eor	w9,w9,w14
-	sli	v17.4s,v22.4s,#7
-	ror	w10,w10,#25
-	ext	v2.16b,v2.16b,v2.16b,#8
-	ror	w11,w11,#25
-	ext	v6.16b,v6.16b,v6.16b,#8
-	ror	w12,w12,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	cbnz	x4,Loop_neon
-
-	add	w5,w5,w22		// accumulate key block
-	add	v0.4s,v0.4s,v24.4s
-	add	x6,x6,x22,lsr#32
-	add	v4.4s,v4.4s,v24.4s
-	add	w7,w7,w23
-	add	v16.4s,v16.4s,v24.4s
-	add	x8,x8,x23,lsr#32
-	add	v2.4s,v2.4s,v26.4s
-	add	w9,w9,w24
-	add	v6.4s,v6.4s,v26.4s
-	add	x10,x10,x24,lsr#32
-	add	v18.4s,v18.4s,v26.4s
-	add	w11,w11,w25
-	add	v3.4s,v3.4s,v27.4s
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	v7.4s,v7.4s,v28.4s
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	v19.4s,v19.4s,v29.4s
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	v1.4s,v1.4s,v25.4s
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	v5.4s,v5.4s,v25.4s
-	add	x21,x21,x30,lsr#32
-	add	v17.4s,v17.4s,v25.4s
-
-	b.lo	Ltail_neon
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	v0.16b,v0.16b,v20.16b
-	eor	x15,x15,x16
-	eor	v1.16b,v1.16b,v21.16b
-	eor	x17,x17,x19
-	eor	v2.16b,v2.16b,v22.16b
-	eor	x20,x20,x21
-	eor	v3.16b,v3.16b,v23.16b
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#4			// increment counter
-	stp	x9,x11,[x0,#16]
-	add	v27.4s,v27.4s,v31.4s		// += 4
-	stp	x13,x15,[x0,#32]
-	add	v28.4s,v28.4s,v31.4s
-	stp	x17,x20,[x0,#48]
-	add	v29.4s,v29.4s,v31.4s
-	add	x0,x0,#64
-
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-
-	eor	v4.16b,v4.16b,v20.16b
-	eor	v5.16b,v5.16b,v21.16b
-	eor	v6.16b,v6.16b,v22.16b
-	eor	v7.16b,v7.16b,v23.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
-	eor	v16.16b,v16.16b,v0.16b
-	eor	v17.16b,v17.16b,v1.16b
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v19.16b,v19.16b,v3.16b
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
-	b.hi	Loop_outer_neon
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-Ltail_neon:
-	add	x2,x2,#256
-	cmp	x2,#64
-	b.lo	Less_than_64
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#4			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-	cmp	x2,#64
-	b.lo	Less_than_128
-
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	v0.16b,v0.16b,v20.16b
-	eor	v1.16b,v1.16b,v21.16b
-	eor	v2.16b,v2.16b,v22.16b
-	eor	v3.16b,v3.16b,v23.16b
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-	cmp	x2,#64
-	b.lo	Less_than_192
-
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-	eor	v4.16b,v4.16b,v20.16b
-	eor	v5.16b,v5.16b,v21.16b
-	eor	v6.16b,v6.16b,v22.16b
-	eor	v7.16b,v7.16b,v23.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-	b.eq	Ldone_neon
-	sub	x2,x2,#64
-
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
-	b	Last_neon
-
-Less_than_128:
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
-	b	Last_neon
-Less_than_192:
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
-	b	Last_neon
-
-.align	4
-Last_neon:
-	sub	x0,x0,#1
-	add	x1,x1,x2
-	add	x0,x0,x2
-	add	x4,sp,x2
-	neg	x2,x2
-
-Loop_tail_neon:
-	ldrb	w10,[x1,x2]
-	ldrb	w11,[x4,x2]
-	add	x2,x2,#1
-	eor	w10,w10,w11
-	strb	w10,[x0,x2]
-	cbnz	x2,Loop_tail_neon
-
-	stp	xzr,xzr,[sp,#0]
-	stp	xzr,xzr,[sp,#16]
-	stp	xzr,xzr,[sp,#32]
-	stp	xzr,xzr,[sp,#48]
-
-Ldone_neon:
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.align	5
-ChaCha20_512_neon:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adrp	x5,Lsigma@PAGE
-	add	x5,x5,Lsigma@PAGEOFF
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-L512_or_more_neon:
-	sub	sp,sp,#128+64
-
-	ldp	x22,x23,[x5]		// load sigma
-	ld1	{v24.4s},[x5],#16
-	ldp	x24,x25,[x3]		// load key
-	ldp	x26,x27,[x3,#16]
-	ld1	{v25.4s,v26.4s},[x3]
-	ldp	x28,x30,[x4]		// load counter
-	ld1	{v27.4s},[x4]
-	ld1	{v31.4s},[x5]
-#ifdef	__AARCH64EB__
-	rev64	v24.4s,v24.4s
-	ror	x24,x24,#32
-	ror	x25,x25,#32
-	ror	x26,x26,#32
-	ror	x27,x27,#32
-	ror	x28,x28,#32
-	ror	x30,x30,#32
-#endif
-	add	v27.4s,v27.4s,v31.4s		// += 1
-	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
-	add	v27.4s,v27.4s,v31.4s		// not typo
-	str	q26,[sp,#32]
-	add	v28.4s,v27.4s,v31.4s
-	add	v29.4s,v28.4s,v31.4s
-	add	v30.4s,v29.4s,v31.4s
-	shl	v31.4s,v31.4s,#2			// 1 -> 4
-
-	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
-	stp	d10,d11,[sp,#128+16]
-	stp	d12,d13,[sp,#128+32]
-	stp	d14,d15,[sp,#128+48]
-
-	sub	x2,x2,#512			// not typo
-
-Loop_outer_512_neon:
-	mov	v0.16b,v24.16b
-	mov	v4.16b,v24.16b
-	mov	v8.16b,v24.16b
-	mov	v12.16b,v24.16b
-	mov	v16.16b,v24.16b
-	mov	v20.16b,v24.16b
-	mov	v1.16b,v25.16b
-	mov	w5,w22			// unpack key block
-	mov	v5.16b,v25.16b
-	lsr	x6,x22,#32
-	mov	v9.16b,v25.16b
-	mov	w7,w23
-	mov	v13.16b,v25.16b
-	lsr	x8,x23,#32
-	mov	v17.16b,v25.16b
-	mov	w9,w24
-	mov	v21.16b,v25.16b
-	lsr	x10,x24,#32
-	mov	v3.16b,v27.16b
-	mov	w11,w25
-	mov	v7.16b,v28.16b
-	lsr	x12,x25,#32
-	mov	v11.16b,v29.16b
-	mov	w13,w26
-	mov	v15.16b,v30.16b
-	lsr	x14,x26,#32
-	mov	v2.16b,v26.16b
-	mov	w15,w27
-	mov	v6.16b,v26.16b
-	lsr	x16,x27,#32
-	add	v19.4s,v3.4s,v31.4s			// +4
-	mov	w17,w28
-	add	v23.4s,v7.4s,v31.4s			// +4
-	lsr	x19,x28,#32
-	mov	v10.16b,v26.16b
-	mov	w20,w30
-	mov	v14.16b,v26.16b
-	lsr	x21,x30,#32
-	mov	v18.16b,v26.16b
-	stp	q27,q28,[sp,#48]		// off-load key block, variable part
-	mov	v22.16b,v26.16b
-	str	q29,[sp,#80]
-
-	mov	x4,#5
-	subs	x2,x2,#512
-Loop_upper_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v11.16b,v11.16b,v11.16b,#12
-	ext	v15.16b,v15.16b,v15.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v23.16b,v23.16b,v23.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v9.16b,v9.16b,v9.16b,#4
-	ext	v13.16b,v13.16b,v13.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	ext	v21.16b,v21.16b,v21.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v11.16b,v11.16b,v11.16b,#4
-	ext	v15.16b,v15.16b,v15.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v23.16b,v23.16b,v23.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v9.16b,v9.16b,v9.16b,#12
-	ext	v13.16b,v13.16b,v13.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	ext	v21.16b,v21.16b,v21.16b,#12
-	cbnz	x4,Loop_upper_neon
-
-	add	w5,w5,w22		// accumulate key block
-	add	x6,x6,x22,lsr#32
-	add	w7,w7,w23
-	add	x8,x8,x23,lsr#32
-	add	w9,w9,w24
-	add	x10,x10,x24,lsr#32
-	add	w11,w11,w25
-	add	x12,x12,x25,lsr#32
-	add	w13,w13,w26
-	add	x14,x14,x26,lsr#32
-	add	w15,w15,w27
-	add	x16,x16,x27,lsr#32
-	add	w17,w17,w28
-	add	x19,x19,x28,lsr#32
-	add	w20,w20,w30
-	add	x21,x21,x30,lsr#32
-
-	add	x5,x5,x6,lsl#32	// pack
-	add	x7,x7,x8,lsl#32
-	ldp	x6,x8,[x1,#0]		// load input
-	add	x9,x9,x10,lsl#32
-	add	x11,x11,x12,lsl#32
-	ldp	x10,x12,[x1,#16]
-	add	x13,x13,x14,lsl#32
-	add	x15,x15,x16,lsl#32
-	ldp	x14,x16,[x1,#32]
-	add	x17,x17,x19,lsl#32
-	add	x20,x20,x21,lsl#32
-	ldp	x19,x21,[x1,#48]
-	add	x1,x1,#64
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	x15,x15,x16
-	eor	x17,x17,x19
-	eor	x20,x20,x21
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#1			// increment counter
-	mov	w5,w22			// unpack key block
-	lsr	x6,x22,#32
-	stp	x9,x11,[x0,#16]
-	mov	w7,w23
-	lsr	x8,x23,#32
-	stp	x13,x15,[x0,#32]
-	mov	w9,w24
-	lsr	x10,x24,#32
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	mov	w11,w25
-	lsr	x12,x25,#32
-	mov	w13,w26
-	lsr	x14,x26,#32
-	mov	w15,w27
-	lsr	x16,x27,#32
-	mov	w17,w28
-	lsr	x19,x28,#32
-	mov	w20,w30
-	lsr	x21,x30,#32
-
-	mov	x4,#5
-Loop_lower_neon:
-	sub	x4,x4,#1
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#12
-	ext	v7.16b,v7.16b,v7.16b,#12
-	ext	v11.16b,v11.16b,v11.16b,#12
-	ext	v15.16b,v15.16b,v15.16b,#12
-	ext	v19.16b,v19.16b,v19.16b,#12
-	ext	v23.16b,v23.16b,v23.16b,#12
-	ext	v1.16b,v1.16b,v1.16b,#4
-	ext	v5.16b,v5.16b,v5.16b,#4
-	ext	v9.16b,v9.16b,v9.16b,#4
-	ext	v13.16b,v13.16b,v13.16b,#4
-	ext	v17.16b,v17.16b,v17.16b,#4
-	ext	v21.16b,v21.16b,v21.16b,#4
-	add	v0.4s,v0.4s,v1.4s
-	add	w5,w5,w9
-	add	v4.4s,v4.4s,v5.4s
-	add	w6,w6,w10
-	add	v8.4s,v8.4s,v9.4s
-	add	w7,w7,w11
-	add	v12.4s,v12.4s,v13.4s
-	add	w8,w8,w12
-	add	v16.4s,v16.4s,v17.4s
-	eor	w17,w17,w5
-	add	v20.4s,v20.4s,v21.4s
-	eor	w19,w19,w6
-	eor	v3.16b,v3.16b,v0.16b
-	eor	w20,w20,w7
-	eor	v7.16b,v7.16b,v4.16b
-	eor	w21,w21,w8
-	eor	v11.16b,v11.16b,v8.16b
-	ror	w17,w17,#16
-	eor	v15.16b,v15.16b,v12.16b
-	ror	w19,w19,#16
-	eor	v19.16b,v19.16b,v16.16b
-	ror	w20,w20,#16
-	eor	v23.16b,v23.16b,v20.16b
-	ror	w21,w21,#16
-	rev32	v3.8h,v3.8h
-	add	w13,w13,w17
-	rev32	v7.8h,v7.8h
-	add	w14,w14,w19
-	rev32	v11.8h,v11.8h
-	add	w15,w15,w20
-	rev32	v15.8h,v15.8h
-	add	w16,w16,w21
-	rev32	v19.8h,v19.8h
-	eor	w9,w9,w13
-	rev32	v23.8h,v23.8h
-	eor	w10,w10,w14
-	add	v2.4s,v2.4s,v3.4s
-	eor	w11,w11,w15
-	add	v6.4s,v6.4s,v7.4s
-	eor	w12,w12,w16
-	add	v10.4s,v10.4s,v11.4s
-	ror	w9,w9,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w10,w10,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w11,w11,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w12,w12,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w9
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w10
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w11
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w12
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w17,w17,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w19,w19,w6
-	ushr	v1.4s,v24.4s,#20
-	eor	w20,w20,w7
-	ushr	v5.4s,v25.4s,#20
-	eor	w21,w21,w8
-	ushr	v9.4s,v26.4s,#20
-	ror	w17,w17,#24
-	ushr	v13.4s,v27.4s,#20
-	ror	w19,w19,#24
-	ushr	v17.4s,v28.4s,#20
-	ror	w20,w20,#24
-	ushr	v21.4s,v29.4s,#20
-	ror	w21,w21,#24
-	sli	v1.4s,v24.4s,#12
-	add	w13,w13,w17
-	sli	v5.4s,v25.4s,#12
-	add	w14,w14,w19
-	sli	v9.4s,v26.4s,#12
-	add	w15,w15,w20
-	sli	v13.4s,v27.4s,#12
-	add	w16,w16,w21
-	sli	v17.4s,v28.4s,#12
-	eor	w9,w9,w13
-	sli	v21.4s,v29.4s,#12
-	eor	w10,w10,w14
-	add	v0.4s,v0.4s,v1.4s
-	eor	w11,w11,w15
-	add	v4.4s,v4.4s,v5.4s
-	eor	w12,w12,w16
-	add	v8.4s,v8.4s,v9.4s
-	ror	w9,w9,#25
-	add	v12.4s,v12.4s,v13.4s
-	ror	w10,w10,#25
-	add	v16.4s,v16.4s,v17.4s
-	ror	w11,w11,#25
-	add	v20.4s,v20.4s,v21.4s
-	ror	w12,w12,#25
-	eor	v24.16b,v3.16b,v0.16b
-	add	w5,w5,w10
-	eor	v25.16b,v7.16b,v4.16b
-	add	w6,w6,w11
-	eor	v26.16b,v11.16b,v8.16b
-	add	w7,w7,w12
-	eor	v27.16b,v15.16b,v12.16b
-	add	w8,w8,w9
-	eor	v28.16b,v19.16b,v16.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v23.16b,v20.16b
-	eor	w17,w17,w6
-	ushr	v3.4s,v24.4s,#24
-	eor	w19,w19,w7
-	ushr	v7.4s,v25.4s,#24
-	eor	w20,w20,w8
-	ushr	v11.4s,v26.4s,#24
-	ror	w21,w21,#16
-	ushr	v15.4s,v27.4s,#24
-	ror	w17,w17,#16
-	ushr	v19.4s,v28.4s,#24
-	ror	w19,w19,#16
-	ushr	v23.4s,v29.4s,#24
-	ror	w20,w20,#16
-	sli	v3.4s,v24.4s,#8
-	add	w15,w15,w21
-	sli	v7.4s,v25.4s,#8
-	add	w16,w16,w17
-	sli	v11.4s,v26.4s,#8
-	add	w13,w13,w19
-	sli	v15.4s,v27.4s,#8
-	add	w14,w14,w20
-	sli	v19.4s,v28.4s,#8
-	eor	w10,w10,w15
-	sli	v23.4s,v29.4s,#8
-	eor	w11,w11,w16
-	add	v2.4s,v2.4s,v3.4s
-	eor	w12,w12,w13
-	add	v6.4s,v6.4s,v7.4s
-	eor	w9,w9,w14
-	add	v10.4s,v10.4s,v11.4s
-	ror	w10,w10,#20
-	add	v14.4s,v14.4s,v15.4s
-	ror	w11,w11,#20
-	add	v18.4s,v18.4s,v19.4s
-	ror	w12,w12,#20
-	add	v22.4s,v22.4s,v23.4s
-	ror	w9,w9,#20
-	eor	v24.16b,v1.16b,v2.16b
-	add	w5,w5,w10
-	eor	v25.16b,v5.16b,v6.16b
-	add	w6,w6,w11
-	eor	v26.16b,v9.16b,v10.16b
-	add	w7,w7,w12
-	eor	v27.16b,v13.16b,v14.16b
-	add	w8,w8,w9
-	eor	v28.16b,v17.16b,v18.16b
-	eor	w21,w21,w5
-	eor	v29.16b,v21.16b,v22.16b
-	eor	w17,w17,w6
-	ushr	v1.4s,v24.4s,#25
-	eor	w19,w19,w7
-	ushr	v5.4s,v25.4s,#25
-	eor	w20,w20,w8
-	ushr	v9.4s,v26.4s,#25
-	ror	w21,w21,#24
-	ushr	v13.4s,v27.4s,#25
-	ror	w17,w17,#24
-	ushr	v17.4s,v28.4s,#25
-	ror	w19,w19,#24
-	ushr	v21.4s,v29.4s,#25
-	ror	w20,w20,#24
-	sli	v1.4s,v24.4s,#7
-	add	w15,w15,w21
-	sli	v5.4s,v25.4s,#7
-	add	w16,w16,w17
-	sli	v9.4s,v26.4s,#7
-	add	w13,w13,w19
-	sli	v13.4s,v27.4s,#7
-	add	w14,w14,w20
-	sli	v17.4s,v28.4s,#7
-	eor	w10,w10,w15
-	sli	v21.4s,v29.4s,#7
-	eor	w11,w11,w16
-	ext	v2.16b,v2.16b,v2.16b,#8
-	eor	w12,w12,w13
-	ext	v6.16b,v6.16b,v6.16b,#8
-	eor	w9,w9,w14
-	ext	v10.16b,v10.16b,v10.16b,#8
-	ror	w10,w10,#25
-	ext	v14.16b,v14.16b,v14.16b,#8
-	ror	w11,w11,#25
-	ext	v18.16b,v18.16b,v18.16b,#8
-	ror	w12,w12,#25
-	ext	v22.16b,v22.16b,v22.16b,#8
-	ror	w9,w9,#25
-	ext	v3.16b,v3.16b,v3.16b,#4
-	ext	v7.16b,v7.16b,v7.16b,#4
-	ext	v11.16b,v11.16b,v11.16b,#4
-	ext	v15.16b,v15.16b,v15.16b,#4
-	ext	v19.16b,v19.16b,v19.16b,#4
-	ext	v23.16b,v23.16b,v23.16b,#4
-	ext	v1.16b,v1.16b,v1.16b,#12
-	ext	v5.16b,v5.16b,v5.16b,#12
-	ext	v9.16b,v9.16b,v9.16b,#12
-	ext	v13.16b,v13.16b,v13.16b,#12
-	ext	v17.16b,v17.16b,v17.16b,#12
-	ext	v21.16b,v21.16b,v21.16b,#12
-	cbnz	x4,Loop_lower_neon
-
-	add	w5,w5,w22		// accumulate key block
-	ldp	q24,q25,[sp,#0]
-	add	x6,x6,x22,lsr#32
-	ldp	q26,q27,[sp,#32]
-	add	w7,w7,w23
-	ldp	q28,q29,[sp,#64]
-	add	x8,x8,x23,lsr#32
-	add	v0.4s,v0.4s,v24.4s
-	add	w9,w9,w24
-	add	v4.4s,v4.4s,v24.4s
-	add	x10,x10,x24,lsr#32
-	add	v8.4s,v8.4s,v24.4s
-	add	w11,w11,w25
-	add	v12.4s,v12.4s,v24.4s
-	add	x12,x12,x25,lsr#32
-	add	v16.4s,v16.4s,v24.4s
-	add	w13,w13,w26
-	add	v20.4s,v20.4s,v24.4s
-	add	x14,x14,x26,lsr#32
-	add	v2.4s,v2.4s,v26.4s
-	add	w15,w15,w27
-	add	v6.4s,v6.4s,v26.4s
-	add	x16,x16,x27,lsr#32
-	add	v10.4s,v10.4s,v26.4s
-	add	w17,w17,w28
-	add	v14.4s,v14.4s,v26.4s
-	add	x19,x19,x28,lsr#32
-	add	v18.4s,v18.4s,v26.4s
-	add	w20,w20,w30
-	add	v22.4s,v22.4s,v26.4s
-	add	x21,x21,x30,lsr#32
-	add	v19.4s,v19.4s,v31.4s			// +4
-	add	x5,x5,x6,lsl#32	// pack
-	add	v23.4s,v23.4s,v31.4s			// +4
-	add	x7,x7,x8,lsl#32
-	add	v3.4s,v3.4s,v27.4s
-	ldp	x6,x8,[x1,#0]		// load input
-	add	v7.4s,v7.4s,v28.4s
-	add	x9,x9,x10,lsl#32
-	add	v11.4s,v11.4s,v29.4s
-	add	x11,x11,x12,lsl#32
-	add	v15.4s,v15.4s,v30.4s
-	ldp	x10,x12,[x1,#16]
-	add	v19.4s,v19.4s,v27.4s
-	add	x13,x13,x14,lsl#32
-	add	v23.4s,v23.4s,v28.4s
-	add	x15,x15,x16,lsl#32
-	add	v1.4s,v1.4s,v25.4s
-	ldp	x14,x16,[x1,#32]
-	add	v5.4s,v5.4s,v25.4s
-	add	x17,x17,x19,lsl#32
-	add	v9.4s,v9.4s,v25.4s
-	add	x20,x20,x21,lsl#32
-	add	v13.4s,v13.4s,v25.4s
-	ldp	x19,x21,[x1,#48]
-	add	v17.4s,v17.4s,v25.4s
-	add	x1,x1,#64
-	add	v21.4s,v21.4s,v25.4s
-
-#ifdef	__AARCH64EB__
-	rev	x5,x5
-	rev	x7,x7
-	rev	x9,x9
-	rev	x11,x11
-	rev	x13,x13
-	rev	x15,x15
-	rev	x17,x17
-	rev	x20,x20
-#endif
-	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-	eor	x5,x5,x6
-	eor	x7,x7,x8
-	eor	x9,x9,x10
-	eor	x11,x11,x12
-	eor	x13,x13,x14
-	eor	v0.16b,v0.16b,v24.16b
-	eor	x15,x15,x16
-	eor	v1.16b,v1.16b,v25.16b
-	eor	x17,x17,x19
-	eor	v2.16b,v2.16b,v26.16b
-	eor	x20,x20,x21
-	eor	v3.16b,v3.16b,v27.16b
-	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
-
-	stp	x5,x7,[x0,#0]		// store output
-	add	x28,x28,#7			// increment counter
-	stp	x9,x11,[x0,#16]
-	stp	x13,x15,[x0,#32]
-	stp	x17,x20,[x0,#48]
-	add	x0,x0,#64
-	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
-
-	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
-	eor	v4.16b,v4.16b,v24.16b
-	eor	v5.16b,v5.16b,v25.16b
-	eor	v6.16b,v6.16b,v26.16b
-	eor	v7.16b,v7.16b,v27.16b
-	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
-
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	eor	v8.16b,v8.16b,v0.16b
-	ldp	q24,q25,[sp,#0]
-	eor	v9.16b,v9.16b,v1.16b
-	ldp	q26,q27,[sp,#32]
-	eor	v10.16b,v10.16b,v2.16b
-	eor	v11.16b,v11.16b,v3.16b
-	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
-
-	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
-	eor	v12.16b,v12.16b,v4.16b
-	eor	v13.16b,v13.16b,v5.16b
-	eor	v14.16b,v14.16b,v6.16b
-	eor	v15.16b,v15.16b,v7.16b
-	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
-
-	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
-	eor	v16.16b,v16.16b,v8.16b
-	eor	v17.16b,v17.16b,v9.16b
-	eor	v18.16b,v18.16b,v10.16b
-	eor	v19.16b,v19.16b,v11.16b
-	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
-
-	shl	v0.4s,v31.4s,#1			// 4 -> 8
-	eor	v20.16b,v20.16b,v12.16b
-	eor	v21.16b,v21.16b,v13.16b
-	eor	v22.16b,v22.16b,v14.16b
-	eor	v23.16b,v23.16b,v15.16b
-	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
-
-	add	v27.4s,v27.4s,v0.4s			// += 8
-	add	v28.4s,v28.4s,v0.4s
-	add	v29.4s,v29.4s,v0.4s
-	add	v30.4s,v30.4s,v0.4s
-
-	b.hs	Loop_outer_512_neon
-
-	adds	x2,x2,#512
-	ushr	v0.4s,v31.4s,#2			// 4 -> 1
-
-	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
-	ldp	d10,d11,[sp,#128+16]
-	ldp	d12,d13,[sp,#128+32]
-	ldp	d14,d15,[sp,#128+48]
-
-	stp	q24,q31,[sp,#0]		// wipe off-load area
-	stp	q24,q31,[sp,#32]
-	stp	q24,q31,[sp,#64]
-
-	b.eq	Ldone_512_neon
-
-	cmp	x2,#192
-	sub	v27.4s,v27.4s,v0.4s			// -= 1
-	sub	v28.4s,v28.4s,v0.4s
-	sub	v29.4s,v29.4s,v0.4s
-	add	sp,sp,#128
-	b.hs	Loop_outer_neon
-
-	eor	v25.16b,v25.16b,v25.16b
-	eor	v26.16b,v26.16b,v26.16b
-	eor	v27.16b,v27.16b,v27.16b
-	eor	v28.16b,v28.16b,v28.16b
-	eor	v29.16b,v29.16b,v29.16b
-	eor	v30.16b,v30.16b,v30.16b
-	b	Loop_outer
-
-Ldone_512_neon:
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#128+64
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S b/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S
deleted file mode 100644
index 04a1e22..0000000
--- a/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8-apple.S
+++ /dev/null
@@ -1,3009 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-.section	__TEXT,__const
-
-.align	7
-Lchacha20_consts:
-.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-Linc:
-.long	1,2,3,4
-Lrol8:
-.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-Lclamp:
-.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-
-.text
-
-
-.align	6
-Lpoly_hash_ad_internal:
-.cfi_startproc
-	cbnz	x4, Lpoly_hash_intro
-	ret
-
-Lpoly_hash_intro:
-	cmp	x4, #16
-	b.lt	Lpoly_hash_ad_tail
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #16
-	b	Lpoly_hash_ad_internal
-
-Lpoly_hash_ad_tail:
-	cbz	x4, Lpoly_hash_ad_ret
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
-	sub	x4, x4, #1
-
-Lpoly_hash_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, x4]
-	mov	v20.b[0], w11
-	subs	x4, x4, #1
-	b.ge	Lpoly_hash_tail_16_compose
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lpoly_hash_ad_ret:
-	ret
-.cfi_endproc
-
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
-//
-.globl	_chacha20_poly1305_seal
-.private_extern	_chacha20_poly1305_seal
-
-.align	6
-_chacha20_poly1305_seal:
-	AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
-	stp	x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset	80
-.cfi_offset	w30, -72
-.cfi_offset	w29, -80
-	mov	x29, sp
-    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
-    // we don't actually use the frame pointer like that, it's probably not
-    // worth bothering.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-.cfi_offset	b15, -8
-.cfi_offset	b14, -16
-.cfi_offset	b13, -24
-.cfi_offset	b12, -32
-.cfi_offset	b11, -40
-.cfi_offset	b10, -48
-.cfi_offset	b9, -56
-.cfi_offset	b8, -64
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
-	ld1	{v28.16b - v30.16b}, [x5]
-
-	mov	x15, #1 // Prepare the Poly1305 state
-	mov	x8, #0
-	mov	x9, #0
-	mov	x10, #0
-
-	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
-	add	x12, x12, x2
-	mov	v31.d[0], x4  // Store the input and aad lengths
-	mov	v31.d[1], x12
-
-	cmp	x2, #128
-	b.le	Lseal_128 // Optimization for smaller buffers
-
-    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
-    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
-    // the fifth block (A4-D4) horizontally.
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	sub	x5, x5, #32
-
-	mov	x6, #10
-
-.align	5
-Lseal_init_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lseal_init_rounds
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #4
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	and	v4.16b, v4.16b, v27.16b
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	mov	x16, v4.d[0] // Move the R key to GPRs
-	mov	x17, v4.d[1]
-	mov	v27.16b, v9.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-
-	mov	x3, x0
-	cmp	x2, #256
-	b.le	Lseal_tail
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #256
-
-	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
-	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
-
-Lseal_main_loop:
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	sub	x5, x5, #32
-.align	5
-Lseal_main_loop_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x6, x6, #1
-	b.ge	Lseal_main_loop_rounds
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	subs	x7, x7, #1
-	b.gt	Lseal_main_loop_rounds
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #5
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	add	v14.4s, v14.4s, v29.4s
-	add	v19.4s, v19.4s, v30.4s
-
-	cmp	x2, #320
-	b.le	Lseal_tail
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v4.16b
-	eor	v21.16b, v21.16b, v9.16b
-	eor	v22.16b, v22.16b, v14.16b
-	eor	v23.16b, v23.16b, v19.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #320
-
-	mov	x6, #0
-	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
-
-	b	Lseal_main_loop
-
-Lseal_tail:
-    // This part of the function handles the storage and authentication of the last [0,320) bytes
-    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
-	cmp	x2, #64
-	b.lt	Lseal_tail_64
-
-    // Store and authenticate 64B blocks per iteration
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v22.d[0]
-	mov	x12, v22.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v23.d[0]
-	mov	x12, v23.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	st1	{v20.16b - v23.16b}, [x0], #64
-	sub	x2, x2, #64
-
-    // Shift the state left by 64 bytes for the next iteration of the loop
-	mov	v0.16b, v1.16b
-	mov	v5.16b, v6.16b
-	mov	v10.16b, v11.16b
-	mov	v15.16b, v16.16b
-
-	mov	v1.16b, v2.16b
-	mov	v6.16b, v7.16b
-	mov	v11.16b, v12.16b
-	mov	v16.16b, v17.16b
-
-	mov	v2.16b, v3.16b
-	mov	v7.16b, v8.16b
-	mov	v12.16b, v13.16b
-	mov	v17.16b, v18.16b
-
-	mov	v3.16b, v4.16b
-	mov	v8.16b, v9.16b
-	mov	v13.16b, v14.16b
-	mov	v18.16b, v19.16b
-
-	b	Lseal_tail
-
-Lseal_tail_64:
-	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
-
-    // Here we handle the last [0,64) bytes of plaintext
-	cmp	x2, #16
-	b.lt	Lseal_tail_16
-    // Each iteration encrypt and authenticate a 16B block
-	ld1	{v20.16b}, [x1], #16
-	eor	v20.16b, v20.16b, v0.16b
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	st1	{v20.16b}, [x0], #16
-
-	sub	x2, x2, #16
-
-    // Shift the state left by 16 bytes for the next iteration of the loop
-	mov	v0.16b, v5.16b
-	mov	v5.16b, v10.16b
-	mov	v10.16b, v15.16b
-
-	b	Lseal_tail_64
-
-Lseal_tail_16:
-    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
-	cbz	x2, Lseal_hash_extra
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
-	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
-	not	v22.16b, v20.16b
-
-	mov	x6, x2
-	add	x1, x1, x2
-
-	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
-
-	mov	x7, #16          // We need to load some extra_in first for padding
-	sub	x7, x7, x2
-	cmp	x4, x7
-	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
-	mov	x12, x7
-	add	x3, x3, x7
-	sub	x4, x4, x7
-
-Lseal_tail16_compose_extra_in:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, #-1]!
-	mov	v20.b[0], w11
-	subs	x7, x7, #1
-	b.gt	Lseal_tail16_compose_extra_in
-
-	add	x3, x3, x12
-
-Lseal_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x1, #-1]!
-	mov	v20.b[0], w11
-	ext	v21.16b, v22.16b, v21.16b, #15
-	subs	x2, x2, #1
-	b.gt	Lseal_tail_16_compose
-
-	and	v0.16b, v0.16b, v21.16b
-	eor	v20.16b, v20.16b, v0.16b
-	mov	v21.16b, v20.16b
-
-Lseal_tail_16_store:
-	umov	w11, v20.b[0]
-	strb	w11, [x0], #1
-	ext	v20.16b, v20.16b, v20.16b, #1
-	subs	x6, x6, #1
-	b.gt	Lseal_tail_16_store
-
-    // Hash in the final ct block concatenated with extra_in
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lseal_hash_extra:
-	cbz	x4, Lseal_finalize
-
-Lseal_hash_extra_loop:
-	cmp	x4, #16
-	b.lt	Lseal_hash_extra_tail
-	ld1	{v20.16b}, [x3], #16
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #16
-	b	Lseal_hash_extra_loop
-
-Lseal_hash_extra_tail:
-	cbz	x4, Lseal_finalize
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
-	add	x3, x3, x4
-
-Lseal_hash_extra_load:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x3, #-1]!
-	mov	v20.b[0], w11
-	subs	x4, x4, #1
-	b.gt	Lseal_hash_extra_load
-
-    // Hash in the final padded extra_in block
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-Lseal_finalize:
-	mov	x11, v31.d[0]
-	mov	x12, v31.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-    // Final reduction step
-	sub	x12, xzr, x15
-	orr	x13, xzr, #3
-	subs	x11, x8, #-5
-	sbcs	x12, x9, x12
-	sbcs	x13, x10, x13
-	csel	x8, x11, x8, cs
-	csel	x9, x12, x9, cs
-	csel	x10, x13, x10, cs
-	mov	x11, v27.d[0]
-	mov	x12, v27.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-
-	stp	x8, x9, [x5]
-
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-.cfi_restore	b15
-.cfi_restore	b14
-.cfi_restore	b13
-.cfi_restore	b12
-.cfi_restore	b11
-.cfi_restore	b10
-.cfi_restore	b9
-.cfi_restore	b8
-	ldp	x29, x30, [sp], 80
-.cfi_restore	w29
-.cfi_restore	w30
-.cfi_def_cfa_offset	0
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-Lseal_128:
-    // On some architectures preparing 5 blocks for small buffers is wasteful
-	eor	v25.16b, v25.16b, v25.16b
-	mov	x11, #1
-	mov	v25.s[0], w11
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v17.16b, v30.16b
-	add	v15.4s, v17.4s, v25.4s
-	add	v16.4s, v15.4s, v25.4s
-
-	mov	x6, #10
-
-Lseal_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lseal_128_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-
-    // Only the first 32 bytes of the third block (counter = 0) are needed,
-    // so skip updating v12 and v17.
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-
-	add	v30.4s, v30.4s, v25.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v30.4s, v30.4s, v25.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	and	v2.16b, v2.16b, v27.16b
-	mov	x16, v2.d[0] // Move the R key to GPRs
-	mov	x17, v2.d[1]
-	mov	v27.16b, v7.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-	b	Lseal_tail
-.cfi_endproc
-
-
-/////////////////////////////////
-//
-// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
-//
-.globl	_chacha20_poly1305_open
-.private_extern	_chacha20_poly1305_open
-
-.align	6
-_chacha20_poly1305_open:
-	AARCH64_SIGN_LINK_REGISTER
-.cfi_startproc
-	stp	x29, x30, [sp, #-80]!
-.cfi_def_cfa_offset	80
-.cfi_offset	w30, -72
-.cfi_offset	w29, -80
-	mov	x29, sp
-    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
-    // we don't actually use the frame pointer like that, it's probably not
-    // worth bothering.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-.cfi_offset	b15, -8
-.cfi_offset	b14, -16
-.cfi_offset	b13, -24
-.cfi_offset	b12, -32
-.cfi_offset	b11, -40
-.cfi_offset	b10, -48
-.cfi_offset	b9, -56
-.cfi_offset	b8, -64
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
-	ld1	{v28.16b - v30.16b}, [x5]
-
-	mov	x15, #1 // Prepare the Poly1305 state
-	mov	x8, #0
-	mov	x9, #0
-	mov	x10, #0
-
-	mov	v31.d[0], x4  // Store the input and aad lengths
-	mov	v31.d[1], x2
-
-	cmp	x2, #128
-	b.le	Lopen_128 // Optimization for smaller buffers
-
-    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
-	mov	v0.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v15.16b, v30.16b
-
-	mov	x6, #10
-
-.align	5
-Lopen_init_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lopen_init_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-
-	and	v0.16b, v0.16b, v27.16b
-	mov	x16, v0.d[0] // Move the R key to GPRs
-	mov	x17, v0.d[1]
-	mov	v27.16b, v5.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-
-Lopen_ad_done:
-	mov	x3, x1
-
-// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
-Lopen_main_loop:
-
-	cmp	x2, #192
-	b.lt	Lopen_tail
-
-	adrp	x11, Lchacha20_consts@PAGE
-	add	x11, x11, Lchacha20_consts@PAGEOFF
-
-	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
-	mov	v4.16b, v24.16b
-
-	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
-	mov	v9.16b, v28.16b
-
-	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
-	mov	v14.16b, v29.16b
-
-	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
-	sub	x5, x5, #32
-	add	v15.4s, v15.4s, v25.4s
-	mov	v19.16b, v30.16b
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
-	sub	x4, x4, #10
-
-	mov	x7, #10
-	subs	x6, x7, x4
-	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
-	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
-
-	cbz	x7, Lopen_main_loop_rounds_short
-
-.align	5
-Lopen_main_loop_rounds:
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-Lopen_main_loop_rounds_short:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v18.8h, v18.8h
-	rev32	v19.8h, v19.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	eor	v8.16b, v8.16b, v13.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v9.4s, #20
-	sli	v8.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	add	v3.4s, v3.4s, v7.4s
-	add	v4.4s, v4.4s, v8.4s
-
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	eor	v18.16b, v18.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	add	v13.4s, v13.4s, v18.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v14.16b
-
-	ushr	v9.4s, v8.4s, #25
-	sli	v9.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #4
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #12
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	add	v0.4s, v0.4s, v6.4s
-	add	v1.4s, v1.4s, v7.4s
-	add	v2.4s, v2.4s, v8.4s
-	add	v3.4s, v3.4s, v5.4s
-	add	v4.4s, v4.4s, v9.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	rev32	v18.8h, v18.8h
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-	rev32	v19.8h, v19.8h
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v6.16b, v6.16b, v12.16b
-	eor	v7.16b, v7.16b, v13.16b
-	eor	v8.16b, v8.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v9.16b, v9.16b, v14.16b
-
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-	ushr	v7.4s, v8.4s, #20
-	sli	v7.4s, v8.4s, #12
-	ushr	v8.4s, v5.4s, #20
-	sli	v8.4s, v5.4s, #12
-	ushr	v5.4s, v9.4s, #20
-	sli	v5.4s, v9.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	add	v3.4s, v3.4s, v8.4s
-	add	v4.4s, v4.4s, v5.4s
-
-	eor	v18.16b, v18.16b, v0.16b
-	eor	v15.16b, v15.16b, v1.16b
-	eor	v16.16b, v16.16b, v2.16b
-	eor	v17.16b, v17.16b, v3.16b
-	eor	v19.16b, v19.16b, v4.16b
-
-	tbl	v18.16b, {v18.16b}, v26.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-	tbl	v19.16b, {v19.16b}, v26.16b
-
-	add	v12.4s, v12.4s, v18.4s
-	add	v13.4s, v13.4s, v15.4s
-	add	v10.4s, v10.4s, v16.4s
-	add	v11.4s, v11.4s, v17.4s
-	add	v14.4s, v14.4s, v19.4s
-
-	eor	v20.16b, v20.16b, v12.16b
-	eor	v6.16b, v6.16b, v13.16b
-	eor	v7.16b, v7.16b, v10.16b
-	eor	v8.16b, v8.16b, v11.16b
-	eor	v5.16b, v5.16b, v14.16b
-
-	ushr	v9.4s, v5.4s, #25
-	sli	v9.4s, v5.4s, #7
-	ushr	v5.4s, v8.4s, #25
-	sli	v5.4s, v8.4s, #7
-	ushr	v8.4s, v7.4s, #25
-	sli	v8.4s, v7.4s, #7
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-
-	ext	v9.16b, v9.16b, v9.16b, #12
-	ext	v14.16b, v14.16b, v14.16b, #8
-	ext	v19.16b, v19.16b, v19.16b, #4
-	subs	x7, x7, #1
-	b.gt	Lopen_main_loop_rounds
-	subs	x6, x6, #1
-	b.ge	Lopen_main_loop_rounds_short
-
-	eor	v20.16b, v20.16b, v20.16b //zero
-	not	v21.16b, v20.16b // -1
-	sub	v21.4s, v25.4s, v21.4s // Add +1
-	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
-	add	v19.4s, v19.4s, v20.4s
-
-	add	v15.4s, v15.4s, v25.4s
-	mov	x11, #5
-	dup	v20.4s, w11
-	add	v25.4s, v25.4s, v20.4s
-
-	zip1	v20.4s, v0.4s, v1.4s
-	zip2	v21.4s, v0.4s, v1.4s
-	zip1	v22.4s, v2.4s, v3.4s
-	zip2	v23.4s, v2.4s, v3.4s
-
-	zip1	v0.2d, v20.2d, v22.2d
-	zip2	v1.2d, v20.2d, v22.2d
-	zip1	v2.2d, v21.2d, v23.2d
-	zip2	v3.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v5.4s, v6.4s
-	zip2	v21.4s, v5.4s, v6.4s
-	zip1	v22.4s, v7.4s, v8.4s
-	zip2	v23.4s, v7.4s, v8.4s
-
-	zip1	v5.2d, v20.2d, v22.2d
-	zip2	v6.2d, v20.2d, v22.2d
-	zip1	v7.2d, v21.2d, v23.2d
-	zip2	v8.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v10.4s, v11.4s
-	zip2	v21.4s, v10.4s, v11.4s
-	zip1	v22.4s, v12.4s, v13.4s
-	zip2	v23.4s, v12.4s, v13.4s
-
-	zip1	v10.2d, v20.2d, v22.2d
-	zip2	v11.2d, v20.2d, v22.2d
-	zip1	v12.2d, v21.2d, v23.2d
-	zip2	v13.2d, v21.2d, v23.2d
-
-	zip1	v20.4s, v15.4s, v16.4s
-	zip2	v21.4s, v15.4s, v16.4s
-	zip1	v22.4s, v17.4s, v18.4s
-	zip2	v23.4s, v17.4s, v18.4s
-
-	zip1	v15.2d, v20.2d, v22.2d
-	zip2	v16.2d, v20.2d, v22.2d
-	zip1	v17.2d, v21.2d, v23.2d
-	zip2	v18.2d, v21.2d, v23.2d
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-
-	add	v1.4s, v1.4s, v24.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	add	v2.4s, v2.4s, v24.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v3.4s, v3.4s, v24.4s
-	add	v8.4s, v8.4s, v28.4s
-	add	v13.4s, v13.4s, v29.4s
-	add	v18.4s, v18.4s, v30.4s
-
-	add	v4.4s, v4.4s, v24.4s
-	add	v9.4s, v9.4s, v28.4s
-	add	v14.4s, v14.4s, v29.4s
-	add	v19.4s, v19.4s, v30.4s
-
-    // We can always safely store 192 bytes
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #192
-
-	mov	v0.16b, v3.16b
-	mov	v5.16b, v8.16b
-	mov	v10.16b, v13.16b
-	mov	v15.16b, v18.16b
-
-	cmp	x2, #64
-	b.lt	Lopen_tail_64_store
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v3.16b
-	eor	v21.16b, v21.16b, v8.16b
-	eor	v22.16b, v22.16b, v13.16b
-	eor	v23.16b, v23.16b, v18.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-
-	mov	v0.16b, v4.16b
-	mov	v5.16b, v9.16b
-	mov	v10.16b, v14.16b
-	mov	v15.16b, v19.16b
-
-	cmp	x2, #64
-	b.lt	Lopen_tail_64_store
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-	eor	v20.16b, v20.16b, v4.16b
-	eor	v21.16b, v21.16b, v9.16b
-	eor	v22.16b, v22.16b, v14.16b
-	eor	v23.16b, v23.16b, v19.16b
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-	b	Lopen_main_loop
-
-Lopen_tail:
-
-	cbz	x2, Lopen_finalize
-
-	lsr	x4, x2, #4 // How many whole blocks we have to hash
-
-	cmp	x2, #64
-	b.le	Lopen_tail_64
-	cmp	x2, #128
-	b.le	Lopen_tail_128
-
-Lopen_tail_192:
-     // We need three more blocks
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v15.16b, v30.16b
-	mov	v16.16b, v30.16b
-	mov	v17.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	eor	v21.16b, v21.16b, v21.16b
-	ins	v23.s[0], v25.s[0]
-	ins	v21.d[0], x15
-
-	add	v22.4s, v23.4s, v21.4s
-	add	v21.4s, v22.4s, v21.4s
-
-	add	v15.4s, v15.4s, v21.4s
-	add	v16.4s, v16.4s, v23.4s
-	add	v17.4s, v17.4s, v22.4s
-
-	mov	x7, #10
-	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
-	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
-	sub	x4, x4, x7
-
-	cbz	x7, Lopen_tail_192_rounds_no_hash
-
-Lopen_tail_192_rounds:
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-Lopen_tail_192_rounds_no_hash:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x7, x7, #1
-	b.gt	Lopen_tail_192_rounds
-	subs	x6, x6, #1
-	b.ge	Lopen_tail_192_rounds_no_hash
-
-    // We hashed 160 bytes at most; there may still be up to 32 bytes left to hash
-Lopen_tail_192_hash:
-	cbz	x4, Lopen_tail_192_hash_done
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #1
-	b	Lopen_tail_192_hash
-
-Lopen_tail_192_hash_done:
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v12.4s, v12.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v16.4s, v16.4s, v30.4s
-	add	v17.4s, v17.4s, v30.4s
-
-	add	v15.4s, v15.4s, v21.4s
-	add	v16.4s, v16.4s, v23.4s
-	add	v17.4s, v17.4s, v22.4s
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v2.16b
-	eor	v21.16b, v21.16b, v7.16b
-	eor	v22.16b, v22.16b, v12.16b
-	eor	v23.16b, v23.16b, v17.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #128
-	b	Lopen_tail_64_store
-
-Lopen_tail_128:
-     // We need two more blocks
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v15.16b, v30.16b
-	mov	v16.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	eor	v22.16b, v22.16b, v22.16b
-	ins	v23.s[0], v25.s[0]
-	ins	v22.d[0], x15
-	add	v22.4s, v22.4s, v23.4s
-
-	add	v15.4s, v15.4s, v22.4s
-	add	v16.4s, v16.4s, v23.4s
-
-	mov	x6, #10
-	sub	x6, x6, x4
-
-Lopen_tail_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v1.4s, v1.4s, v6.4s
-	eor	v16.16b, v16.16b, v1.16b
-	rev32	v16.8h, v16.8h
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v6.16b, v6.16b, v11.16b
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	add	v1.4s, v1.4s, v20.4s
-	eor	v16.16b, v16.16b, v1.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v20.16b, v20.16b, v11.16b
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v16.16b, v16.16b, v16.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	add	v1.4s, v1.4s, v6.4s
-	eor	v16.16b, v16.16b, v1.16b
-	rev32	v16.8h, v16.8h
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v6.16b, v6.16b, v11.16b
-	ushr	v20.4s, v6.4s, #20
-	sli	v20.4s, v6.4s, #12
-	add	v1.4s, v1.4s, v20.4s
-	eor	v16.16b, v16.16b, v1.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-
-	add	v11.4s, v11.4s, v16.4s
-	eor	v20.16b, v20.16b, v11.16b
-	ushr	v6.4s, v20.4s, #25
-	sli	v6.4s, v20.4s, #7
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v16.16b, v16.16b, v16.16b, #4
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_128_rounds
-	cbz	x4, Lopen_tail_128_rounds_done
-	subs	x4, x4, #1
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	b	Lopen_tail_128_rounds
-
-Lopen_tail_128_rounds_done:
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v16.4s, v16.4s, v30.4s
-	add	v15.4s, v15.4s, v22.4s
-	add	v16.4s, v16.4s, v23.4s
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	eor	v20.16b, v20.16b, v1.16b
-	eor	v21.16b, v21.16b, v6.16b
-	eor	v22.16b, v22.16b, v11.16b
-	eor	v23.16b, v23.16b, v16.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-	sub	x2, x2, #64
-
-	b	Lopen_tail_64_store
-
-Lopen_tail_64:
-    // We just need a single block
-	mov	v0.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v15.16b, v30.16b
-	eor	v23.16b, v23.16b, v23.16b
-	ins	v23.s[0], v25.s[0]
-	add	v15.4s, v15.4s, v23.4s
-
-	mov	x6, #10
-	sub	x6, x6, x4
-
-Lopen_tail_64_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	eor	v15.16b, v15.16b, v0.16b
-	rev32	v15.8h, v15.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v5.16b, v5.16b, v10.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	add	v0.4s, v0.4s, v20.4s
-	eor	v15.16b, v15.16b, v0.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	eor	v20.16b, v20.16b, v10.16b
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v15.16b, v15.16b, v15.16b, #4
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_64_rounds
-	cbz	x4, Lopen_tail_64_rounds_done
-	subs	x4, x4, #1
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	b	Lopen_tail_64_rounds
-
-Lopen_tail_64_rounds_done:
-	add	v0.4s, v0.4s, v24.4s
-	add	v5.4s, v5.4s, v28.4s
-	add	v10.4s, v10.4s, v29.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v15.4s, v15.4s, v23.4s
-
-Lopen_tail_64_store:
-	cmp	x2, #16
-	b.lt	Lopen_tail_16
-
-	ld1	{v20.16b}, [x1], #16
-	eor	v20.16b, v20.16b, v0.16b
-	st1	{v20.16b}, [x0], #16
-	mov	v0.16b, v5.16b
-	mov	v5.16b, v10.16b
-	mov	v10.16b, v15.16b
-	sub	x2, x2, #16
-	b	Lopen_tail_64_store
-
-Lopen_tail_16:
-    // Here we handle the last [0,16) bytes that require a padded block
-	cbz	x2, Lopen_finalize
-
-	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
-	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
-	not	v22.16b, v20.16b
-
-	add	x7, x1, x2
-	mov	x6, x2
-
-Lopen_tail_16_compose:
-	ext	v20.16b, v20.16b, v20.16b, #15
-	ldrb	w11, [x7, #-1]!
-	mov	v20.b[0], w11
-	ext	v21.16b, v22.16b, v21.16b, #15
-	subs	x2, x2, #1
-	b.gt	Lopen_tail_16_compose
-
-	and	v20.16b, v20.16b, v21.16b
-    // Hash in the final padded block
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	eor	v20.16b, v20.16b, v0.16b
-
-Lopen_tail_16_store:
-	umov	w11, v20.b[0]
-	strb	w11, [x0], #1
-	ext	v20.16b, v20.16b, v20.16b, #1
-	subs	x6, x6, #1
-	b.gt	Lopen_tail_16_store
-
-Lopen_finalize:
-	mov	x11, v31.d[0]
-	mov	x12, v31.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-    // Final reduction step
-	sub	x12, xzr, x15
-	orr	x13, xzr, #3
-	subs	x11, x8, #-5
-	sbcs	x12, x9, x12
-	sbcs	x13, x10, x13
-	csel	x8, x11, x8, cs
-	csel	x9, x12, x9, cs
-	csel	x10, x13, x10, cs
-	mov	x11, v27.d[0]
-	mov	x12, v27.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-
-	stp	x8, x9, [x5]
-
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-.cfi_restore	b15
-.cfi_restore	b14
-.cfi_restore	b13
-.cfi_restore	b12
-.cfi_restore	b11
-.cfi_restore	b10
-.cfi_restore	b9
-.cfi_restore	b8
-	ldp	x29, x30, [sp], 80
-.cfi_restore	w29
-.cfi_restore	w30
-.cfi_def_cfa_offset	0
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-Lopen_128:
-    // On some architectures preparing 5 blocks for small buffers is wasteful
-	eor	v25.16b, v25.16b, v25.16b
-	mov	x11, #1
-	mov	v25.s[0], w11
-	mov	v0.16b, v24.16b
-	mov	v1.16b, v24.16b
-	mov	v2.16b, v24.16b
-	mov	v5.16b, v28.16b
-	mov	v6.16b, v28.16b
-	mov	v7.16b, v28.16b
-	mov	v10.16b, v29.16b
-	mov	v11.16b, v29.16b
-	mov	v12.16b, v29.16b
-	mov	v17.16b, v30.16b
-	add	v15.4s, v17.4s, v25.4s
-	add	v16.4s, v15.4s, v25.4s
-
-	mov	x6, #10
-
-Lopen_128_rounds:
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #4
-	ext	v6.16b, v6.16b, v6.16b, #4
-	ext	v7.16b, v7.16b, v7.16b, #4
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #12
-	ext	v16.16b, v16.16b, v16.16b, #12
-	ext	v17.16b, v17.16b, v17.16b, #12
-	add	v0.4s, v0.4s, v5.4s
-	add	v1.4s, v1.4s, v6.4s
-	add	v2.4s, v2.4s, v7.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	rev32	v15.8h, v15.8h
-	rev32	v16.8h, v16.8h
-	rev32	v17.8h, v17.8h
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v5.16b, v5.16b, v10.16b
-	eor	v6.16b, v6.16b, v11.16b
-	eor	v7.16b, v7.16b, v12.16b
-	ushr	v20.4s, v5.4s, #20
-	sli	v20.4s, v5.4s, #12
-	ushr	v5.4s, v6.4s, #20
-	sli	v5.4s, v6.4s, #12
-	ushr	v6.4s, v7.4s, #20
-	sli	v6.4s, v7.4s, #12
-
-	add	v0.4s, v0.4s, v20.4s
-	add	v1.4s, v1.4s, v5.4s
-	add	v2.4s, v2.4s, v6.4s
-	eor	v15.16b, v15.16b, v0.16b
-	eor	v16.16b, v16.16b, v1.16b
-	eor	v17.16b, v17.16b, v2.16b
-	tbl	v15.16b, {v15.16b}, v26.16b
-	tbl	v16.16b, {v16.16b}, v26.16b
-	tbl	v17.16b, {v17.16b}, v26.16b
-
-	add	v10.4s, v10.4s, v15.4s
-	add	v11.4s, v11.4s, v16.4s
-	add	v12.4s, v12.4s, v17.4s
-	eor	v20.16b, v20.16b, v10.16b
-	eor	v5.16b, v5.16b, v11.16b
-	eor	v6.16b, v6.16b, v12.16b
-	ushr	v7.4s, v6.4s, #25
-	sli	v7.4s, v6.4s, #7
-	ushr	v6.4s, v5.4s, #25
-	sli	v6.4s, v5.4s, #7
-	ushr	v5.4s, v20.4s, #25
-	sli	v5.4s, v20.4s, #7
-
-	ext	v5.16b, v5.16b, v5.16b, #12
-	ext	v6.16b, v6.16b, v6.16b, #12
-	ext	v7.16b, v7.16b, v7.16b, #12
-
-	ext	v10.16b, v10.16b, v10.16b, #8
-	ext	v11.16b, v11.16b, v11.16b, #8
-	ext	v12.16b, v12.16b, v12.16b, #8
-
-	ext	v15.16b, v15.16b, v15.16b, #4
-	ext	v16.16b, v16.16b, v16.16b, #4
-	ext	v17.16b, v17.16b, v17.16b, #4
-	subs	x6, x6, #1
-	b.hi	Lopen_128_rounds
-
-	add	v0.4s, v0.4s, v24.4s
-	add	v1.4s, v1.4s, v24.4s
-	add	v2.4s, v2.4s, v24.4s
-
-	add	v5.4s, v5.4s, v28.4s
-	add	v6.4s, v6.4s, v28.4s
-	add	v7.4s, v7.4s, v28.4s
-
-	add	v10.4s, v10.4s, v29.4s
-	add	v11.4s, v11.4s, v29.4s
-
-	add	v30.4s, v30.4s, v25.4s
-	add	v15.4s, v15.4s, v30.4s
-	add	v30.4s, v30.4s, v25.4s
-	add	v16.4s, v16.4s, v30.4s
-
-	and	v2.16b, v2.16b, v27.16b
-	mov	x16, v2.d[0] // Move the R key to GPRs
-	mov	x17, v2.d[1]
-	mov	v27.16b, v7.16b // Store the S key
-
-	bl	Lpoly_hash_ad_internal
-
-Lopen_128_store:
-	cmp	x2, #64
-	b.lt	Lopen_128_store_64
-
-	ld1	{v20.16b - v23.16b}, [x1], #64
-
-	mov	x11, v20.d[0]
-	mov	x12, v20.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v21.d[0]
-	mov	x12, v21.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v22.d[0]
-	mov	x12, v22.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	mov	x11, v23.d[0]
-	mov	x12, v23.d[1]
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v10.16b
-	eor	v23.16b, v23.16b, v15.16b
-
-	st1	{v20.16b - v23.16b}, [x0], #64
-
-	sub	x2, x2, #64
-
-	mov	v0.16b, v1.16b
-	mov	v5.16b, v6.16b
-	mov	v10.16b, v11.16b
-	mov	v15.16b, v16.16b
-
-Lopen_128_store_64:
-
-	lsr	x4, x2, #4
-	mov	x3, x1
-
-Lopen_128_hash_64:
-	cbz	x4, Lopen_tail_64_store
-	ldp	x11, x12, [x3], 16
-	adds	x8, x8, x11
-	adcs	x9, x9, x12
-	adc	x10, x10, x15
-	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
-	umulh	x12, x8, x16
-	mul	x13, x9, x16
-	umulh	x14, x9, x16
-	adds	x12, x12, x13
-	mul	x13, x10, x16
-	adc	x13, x13, x14
-	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
-	umulh	x8, x8, x17
-	adds	x12, x12, x14
-	mul	x14, x9, x17
-	umulh	x9, x9, x17
-	adcs	x14, x14, x8
-	mul	x10, x10, x17
-	adc	x10, x10, x9
-	adds	x13, x13, x14
-	adc	x14, x10, xzr
-	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
-	and	x8, x13, #-4
-	extr	x13, x14, x13, #2
-	adds	x8, x8, x11
-	lsr	x11, x14, #2
-	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
-	adds	x8, x8, x13
-	adcs	x9, x9, x12
-	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
-	sub	x4, x4, #1
-	b	Lopen_128_hash_64
-.cfi_endproc
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
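For reference, the vector round loops in the deleted ChaCha20-Poly1305 assembly above (Lopen_tail_64_rounds, Lopen_128_rounds) are the standard ChaCha20 quarter-round run on one 32-bit lane per column: rev32 on .8h lanes implements the rotate by 16, the ushr/sli pairs implement the rotates by 12 and 7, tbl with a byte-permutation constant implements the rotate by 8, and the ext instructions between the two half-rounds re-align the rows for the diagonal round. A minimal C sketch of one quarter-round (illustrative only, not part of any generated file):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) {
        return (x << n) | (x >> (32 - n));
    }

    /* One ChaCha20 quarter-round; the NEON loops above perform the same
       sequence on four 32-bit lanes at once. */
    static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d) {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);  /* add / eor / rev32    */
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);  /* add / eor / ushr+sli */
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);   /* add / eor / tbl      */
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);   /* add / eor / ushr+sli */
    }

The interleaved mul/umulh blocks on x8-x17 are the Poly1305 accumulator update h = (h + block) * r mod 2^130 - 5, carried on two 64-bit limbs plus a few spare bits in acc2, with the partial reduction folding the bits at and above 2^130 back in via the identity 2^130 ≡ 5 (mod 2^130 - 5).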
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S
deleted file mode 100644
index 144c4af..0000000
--- a/apple-aarch64/crypto/fipsmodule/aesv8-armv8-apple.S
+++ /dev/null
@@ -1,791 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.section	__TEXT,__const
-.align	5
-Lrcon:
-.long	0x01,0x01,0x01,0x01
-.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
-.long	0x1b,0x1b,0x1b,0x1b
-
-.text
-
-.globl	_aes_hw_set_encrypt_key
-.private_extern	_aes_hw_set_encrypt_key
-
-.align	5
-_aes_hw_set_encrypt_key:
-Lenc_key:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	mov	x3,#-1
-	cmp	x0,#0
-	b.eq	Lenc_key_abort
-	cmp	x2,#0
-	b.eq	Lenc_key_abort
-	mov	x3,#-2
-	cmp	w1,#128
-	b.lt	Lenc_key_abort
-	cmp	w1,#256
-	b.gt	Lenc_key_abort
-	tst	w1,#0x3f
-	b.ne	Lenc_key_abort
-
-	adrp	x3,Lrcon@PAGE
-	add	x3,x3,Lrcon@PAGEOFF
-	cmp	w1,#192
-
-	eor	v0.16b,v0.16b,v0.16b
-	ld1	{v3.16b},[x0],#16
-	mov	w1,#8		// reuse w1
-	ld1	{v1.4s,v2.4s},[x3],#32
-
-	b.lt	Loop128
-	b.eq	L192
-	b	L256
-
-.align	4
-Loop128:
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-	b.ne	Loop128
-
-	ld1	{v1.4s},[x3]
-
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-
-	tbl	v6.16b,{v3.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v3.4s},[x2],#16
-	aese	v6.16b,v0.16b
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	eor	v3.16b,v3.16b,v6.16b
-	st1	{v3.4s},[x2]
-	add	x2,x2,#0x50
-
-	mov	w12,#10
-	b	Ldone
-
-.align	4
-L192:
-	ld1	{v4.8b},[x0],#8
-	movi	v6.16b,#8			// borrow v6.16b
-	st1	{v3.4s},[x2],#16
-	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
-
-Loop192:
-	tbl	v6.16b,{v4.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v4.8b},[x2],#8
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-
-	dup	v5.4s,v3.s[3]
-	eor	v5.16b,v5.16b,v4.16b
-	eor	v6.16b,v6.16b,v1.16b
-	ext	v4.16b,v0.16b,v4.16b,#12
-	shl	v1.16b,v1.16b,#1
-	eor	v4.16b,v4.16b,v5.16b
-	eor	v3.16b,v3.16b,v6.16b
-	eor	v4.16b,v4.16b,v6.16b
-	st1	{v3.4s},[x2],#16
-	b.ne	Loop192
-
-	mov	w12,#12
-	add	x2,x2,#0x20
-	b	Ldone
-
-.align	4
-L256:
-	ld1	{v4.16b},[x0]
-	mov	w1,#7
-	mov	w12,#14
-	st1	{v3.4s},[x2],#16
-
-Loop256:
-	tbl	v6.16b,{v4.16b},v2.16b
-	ext	v5.16b,v0.16b,v3.16b,#12
-	st1	{v4.4s},[x2],#16
-	aese	v6.16b,v0.16b
-	subs	w1,w1,#1
-
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v3.16b,v3.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v6.16b,v6.16b,v1.16b
-	eor	v3.16b,v3.16b,v5.16b
-	shl	v1.16b,v1.16b,#1
-	eor	v3.16b,v3.16b,v6.16b
-	st1	{v3.4s},[x2],#16
-	b.eq	Ldone
-
-	dup	v6.4s,v3.s[3]		// just splat
-	ext	v5.16b,v0.16b,v4.16b,#12
-	aese	v6.16b,v0.16b
-
-	eor	v4.16b,v4.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v4.16b,v4.16b,v5.16b
-	ext	v5.16b,v0.16b,v5.16b,#12
-	eor	v4.16b,v4.16b,v5.16b
-
-	eor	v4.16b,v4.16b,v6.16b
-	b	Loop256
-
-Ldone:
-	str	w12,[x2]
-	mov	x3,#0
-
-Lenc_key_abort:
-	mov	x0,x3			// return value
-	ldr	x29,[sp],#16
-	ret
-
-
-.globl	_aes_hw_set_decrypt_key
-.private_extern	_aes_hw_set_decrypt_key
-
-.align	5
-_aes_hw_set_decrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	bl	Lenc_key
-
-	cmp	x0,#0
-	b.ne	Ldec_key_abort
-
-	sub	x2,x2,#240		// restore original x2
-	mov	x4,#-16
-	add	x0,x2,x12,lsl#4	// end of key schedule
-
-	ld1	{v0.4s},[x2]
-	ld1	{v1.4s},[x0]
-	st1	{v0.4s},[x0],x4
-	st1	{v1.4s},[x2],#16
-
-Loop_imc:
-	ld1	{v0.4s},[x2]
-	ld1	{v1.4s},[x0]
-	aesimc	v0.16b,v0.16b
-	aesimc	v1.16b,v1.16b
-	st1	{v0.4s},[x0],x4
-	st1	{v1.4s},[x2],#16
-	cmp	x0,x2
-	b.hi	Loop_imc
-
-	ld1	{v0.4s},[x2]
-	aesimc	v0.16b,v0.16b
-	st1	{v0.4s},[x0]
-
-	eor	x0,x0,x0		// return value
-Ldec_key_abort:
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_aes_hw_encrypt
-.private_extern	_aes_hw_encrypt
-
-.align	5
-_aes_hw_encrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	w3,[x2,#240]
-	ld1	{v0.4s},[x2],#16
-	ld1	{v2.16b},[x0]
-	sub	w3,w3,#2
-	ld1	{v1.4s},[x2],#16
-
-Loop_enc:
-	aese	v2.16b,v0.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2],#16
-	subs	w3,w3,#2
-	aese	v2.16b,v1.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v1.4s},[x2],#16
-	b.gt	Loop_enc
-
-	aese	v2.16b,v0.16b
-	aesmc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2]
-	aese	v2.16b,v1.16b
-	eor	v2.16b,v2.16b,v0.16b
-
-	st1	{v2.16b},[x1]
-	ret
-
-.globl	_aes_hw_decrypt
-.private_extern	_aes_hw_decrypt
-
-.align	5
-_aes_hw_decrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	w3,[x2,#240]
-	ld1	{v0.4s},[x2],#16
-	ld1	{v2.16b},[x0]
-	sub	w3,w3,#2
-	ld1	{v1.4s},[x2],#16
-
-Loop_dec:
-	aesd	v2.16b,v0.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2],#16
-	subs	w3,w3,#2
-	aesd	v2.16b,v1.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v1.4s},[x2],#16
-	b.gt	Loop_dec
-
-	aesd	v2.16b,v0.16b
-	aesimc	v2.16b,v2.16b
-	ld1	{v0.4s},[x2]
-	aesd	v2.16b,v1.16b
-	eor	v2.16b,v2.16b,v0.16b
-
-	st1	{v2.16b},[x1]
-	ret
-
-.globl	_aes_hw_cbc_encrypt
-.private_extern	_aes_hw_cbc_encrypt
-
-.align	5
-_aes_hw_cbc_encrypt:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	subs	x2,x2,#16
-	mov	x8,#16
-	b.lo	Lcbc_abort
-	csel	x8,xzr,x8,eq
-
-	cmp	w5,#0			// en- or decrypting?
-	ldr	w5,[x3,#240]
-	and	x2,x2,#-16
-	ld1	{v6.16b},[x4]
-	ld1	{v0.16b},[x0],x8
-
-	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
-	sub	w5,w5,#6
-	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
-	sub	w5,w5,#2
-	ld1	{v18.4s,v19.4s},[x7],#32
-	ld1	{v20.4s,v21.4s},[x7],#32
-	ld1	{v22.4s,v23.4s},[x7],#32
-	ld1	{v7.4s},[x7]
-
-	add	x7,x3,#32
-	mov	w6,w5
-	b.eq	Lcbc_dec
-
-	cmp	w5,#2
-	eor	v0.16b,v0.16b,v6.16b
-	eor	v5.16b,v16.16b,v7.16b
-	b.eq	Lcbc_enc128
-
-	ld1	{v2.4s,v3.4s},[x7]
-	add	x7,x3,#16
-	add	x6,x3,#16*4
-	add	x12,x3,#16*5
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	add	x14,x3,#16*6
-	add	x3,x3,#16*7
-	b	Lenter_cbc_enc
-
-.align	4
-Loop_cbc_enc:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	st1	{v6.16b},[x1],#16
-Lenter_cbc_enc:
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v2.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.4s},[x6]
-	cmp	w5,#4
-	aese	v0.16b,v3.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x12]
-	b.eq	Lcbc_enc192
-
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.4s},[x14]
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x3]
-	nop
-
-Lcbc_enc192:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	subs	x2,x2,#16
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	csel	x8,xzr,x8,eq
-	aese	v0.16b,v18.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v19.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.16b},[x0],x8
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	eor	v16.16b,v16.16b,v5.16b
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v23.16b
-	eor	v6.16b,v0.16b,v7.16b
-	b.hs	Loop_cbc_enc
-
-	st1	{v6.16b},[x1],#16
-	b	Lcbc_done
-
-.align	5
-Lcbc_enc128:
-	ld1	{v2.4s,v3.4s},[x7]
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	b	Lenter_cbc_enc128
-Loop_cbc_enc128:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	st1	{v6.16b},[x1],#16
-Lenter_cbc_enc128:
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	subs	x2,x2,#16
-	aese	v0.16b,v2.16b
-	aesmc	v0.16b,v0.16b
-	csel	x8,xzr,x8,eq
-	aese	v0.16b,v3.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v18.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v19.16b
-	aesmc	v0.16b,v0.16b
-	ld1	{v16.16b},[x0],x8
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	eor	v16.16b,v16.16b,v5.16b
-	aese	v0.16b,v23.16b
-	eor	v6.16b,v0.16b,v7.16b
-	b.hs	Loop_cbc_enc128
-
-	st1	{v6.16b},[x1],#16
-	b	Lcbc_done
-.align	5
-Lcbc_dec:
-	ld1	{v18.16b},[x0],#16
-	subs	x2,x2,#32		// bias
-	add	w6,w5,#2
-	orr	v3.16b,v0.16b,v0.16b
-	orr	v1.16b,v0.16b,v0.16b
-	orr	v19.16b,v18.16b,v18.16b
-	b.lo	Lcbc_dec_tail
-
-	orr	v1.16b,v18.16b,v18.16b
-	ld1	{v18.16b},[x0],#16
-	orr	v2.16b,v0.16b,v0.16b
-	orr	v3.16b,v1.16b,v1.16b
-	orr	v19.16b,v18.16b,v18.16b
-
-Loop3x_cbc_dec:
-	aesd	v0.16b,v16.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aesd	v0.16b,v17.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Loop3x_cbc_dec
-
-	aesd	v0.16b,v16.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	eor	v4.16b,v6.16b,v7.16b
-	subs	x2,x2,#0x30
-	eor	v5.16b,v2.16b,v7.16b
-	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
-	aesd	v0.16b,v17.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	eor	v17.16b,v3.16b,v7.16b
-	add	x0,x0,x6		// x0 is adjusted in such a way that
-					// at exit from the loop v1.16b-v18.16b
-					// are loaded with last "words"
-	orr	v6.16b,v19.16b,v19.16b
-	mov	x7,x3
-	aesd	v0.16b,v20.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v20.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v20.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v2.16b},[x0],#16
-	aesd	v0.16b,v21.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v21.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v21.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v3.16b},[x0],#16
-	aesd	v0.16b,v22.16b
-	aesimc	v0.16b,v0.16b
-	aesd	v1.16b,v22.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v22.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v19.16b},[x0],#16
-	aesd	v0.16b,v23.16b
-	aesd	v1.16b,v23.16b
-	aesd	v18.16b,v23.16b
-	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
-	add	w6,w5,#2
-	eor	v4.16b,v4.16b,v0.16b
-	eor	v5.16b,v5.16b,v1.16b
-	eor	v18.16b,v18.16b,v17.16b
-	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
-	st1	{v4.16b},[x1],#16
-	orr	v0.16b,v2.16b,v2.16b
-	st1	{v5.16b},[x1],#16
-	orr	v1.16b,v3.16b,v3.16b
-	st1	{v18.16b},[x1],#16
-	orr	v18.16b,v19.16b,v19.16b
-	b.hs	Loop3x_cbc_dec
-
-	cmn	x2,#0x30
-	b.eq	Lcbc_done
-	nop
-
-Lcbc_dec_tail:
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Lcbc_dec_tail
-
-	aesd	v1.16b,v16.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v16.16b
-	aesimc	v18.16b,v18.16b
-	aesd	v1.16b,v17.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v17.16b
-	aesimc	v18.16b,v18.16b
-	aesd	v1.16b,v20.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v20.16b
-	aesimc	v18.16b,v18.16b
-	cmn	x2,#0x20
-	aesd	v1.16b,v21.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v21.16b
-	aesimc	v18.16b,v18.16b
-	eor	v5.16b,v6.16b,v7.16b
-	aesd	v1.16b,v22.16b
-	aesimc	v1.16b,v1.16b
-	aesd	v18.16b,v22.16b
-	aesimc	v18.16b,v18.16b
-	eor	v17.16b,v3.16b,v7.16b
-	aesd	v1.16b,v23.16b
-	aesd	v18.16b,v23.16b
-	b.eq	Lcbc_dec_one
-	eor	v5.16b,v5.16b,v1.16b
-	eor	v17.16b,v17.16b,v18.16b
-	orr	v6.16b,v19.16b,v19.16b
-	st1	{v5.16b},[x1],#16
-	st1	{v17.16b},[x1],#16
-	b	Lcbc_done
-
-Lcbc_dec_one:
-	eor	v5.16b,v5.16b,v18.16b
-	orr	v6.16b,v19.16b,v19.16b
-	st1	{v5.16b},[x1],#16
-
-Lcbc_done:
-	st1	{v6.16b},[x4]
-Lcbc_abort:
-	ldr	x29,[sp],#16
-	ret
-
-.globl	_aes_hw_ctr32_encrypt_blocks
-.private_extern	_aes_hw_ctr32_encrypt_blocks
-
-.align	5
-_aes_hw_ctr32_encrypt_blocks:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	ldr	w5,[x3,#240]
-
-	ldr	w8, [x4, #12]
-	ld1	{v0.4s},[x4]
-
-	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
-	sub	w5,w5,#4
-	mov	x12,#16
-	cmp	x2,#2
-	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
-	sub	w5,w5,#2
-	ld1	{v20.4s,v21.4s},[x7],#32
-	ld1	{v22.4s,v23.4s},[x7],#32
-	ld1	{v7.4s},[x7]
-	add	x7,x3,#32
-	mov	w6,w5
-	csel	x12,xzr,x12,lo
-
-	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
-	// affected by silicon errata #1742098 [0] and #1655431 [1],
-	// respectively, where the second instruction of an aese/aesmc
-	// instruction pair may execute twice if an interrupt is taken right
-	// after the first instruction consumes an input register of which a
-	// single 32-bit lane has been updated the last time it was modified.
-	//
-	// This function uses a counter in one 32-bit lane. The vmov lines
-	// could write to v1.16b and v18.16b directly, but that trips these bugs.
-	// We write to v6.16b and copy to the final register as a workaround.
-	//
-	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
-	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
-#ifndef __AARCH64EB__
-	rev	w8, w8
-#endif
-	add	w10, w8, #1
-	orr	v6.16b,v0.16b,v0.16b
-	rev	w10, w10
-	mov	v6.s[3],w10
-	add	w8, w8, #2
-	orr	v1.16b,v6.16b,v6.16b
-	b.ls	Lctr32_tail
-	rev	w12, w8
-	mov	v6.s[3],w12
-	sub	x2,x2,#3		// bias
-	orr	v18.16b,v6.16b,v6.16b
-	b	Loop3x_ctr32
-
-.align	4
-Loop3x_ctr32:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	aese	v18.16b,v16.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	aese	v18.16b,v17.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Loop3x_ctr32
-
-	aese	v0.16b,v16.16b
-	aesmc	v4.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v5.16b,v1.16b
-	ld1	{v2.16b},[x0],#16
-	add	w9,w8,#1
-	aese	v18.16b,v16.16b
-	aesmc	v18.16b,v18.16b
-	ld1	{v3.16b},[x0],#16
-	rev	w9,w9
-	aese	v4.16b,v17.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v17.16b
-	aesmc	v5.16b,v5.16b
-	ld1	{v19.16b},[x0],#16
-	mov	x7,x3
-	aese	v18.16b,v17.16b
-	aesmc	v17.16b,v18.16b
-	aese	v4.16b,v20.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v20.16b
-	aesmc	v5.16b,v5.16b
-	eor	v2.16b,v2.16b,v7.16b
-	add	w10,w8,#2
-	aese	v17.16b,v20.16b
-	aesmc	v17.16b,v17.16b
-	eor	v3.16b,v3.16b,v7.16b
-	add	w8,w8,#3
-	aese	v4.16b,v21.16b
-	aesmc	v4.16b,v4.16b
-	aese	v5.16b,v21.16b
-	aesmc	v5.16b,v5.16b
-	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
-	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
-	 // 32-bit mode. See the comment above.
-	eor	v19.16b,v19.16b,v7.16b
-	mov	v6.s[3], w9
-	aese	v17.16b,v21.16b
-	aesmc	v17.16b,v17.16b
-	orr	v0.16b,v6.16b,v6.16b
-	rev	w10,w10
-	aese	v4.16b,v22.16b
-	aesmc	v4.16b,v4.16b
-	mov	v6.s[3], w10
-	rev	w12,w8
-	aese	v5.16b,v22.16b
-	aesmc	v5.16b,v5.16b
-	orr	v1.16b,v6.16b,v6.16b
-	mov	v6.s[3], w12
-	aese	v17.16b,v22.16b
-	aesmc	v17.16b,v17.16b
-	orr	v18.16b,v6.16b,v6.16b
-	subs	x2,x2,#3
-	aese	v4.16b,v23.16b
-	aese	v5.16b,v23.16b
-	aese	v17.16b,v23.16b
-
-	eor	v2.16b,v2.16b,v4.16b
-	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
-	st1	{v2.16b},[x1],#16
-	eor	v3.16b,v3.16b,v5.16b
-	mov	w6,w5
-	st1	{v3.16b},[x1],#16
-	eor	v19.16b,v19.16b,v17.16b
-	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
-	st1	{v19.16b},[x1],#16
-	b.hs	Loop3x_ctr32
-
-	adds	x2,x2,#3
-	b.eq	Lctr32_done
-	cmp	x2,#1
-	mov	x12,#16
-	csel	x12,xzr,x12,eq
-
-Lctr32_tail:
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v16.4s},[x7],#16
-	subs	w6,w6,#2
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v17.4s},[x7],#16
-	b.gt	Lctr32_tail
-
-	aese	v0.16b,v16.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v16.16b
-	aesmc	v1.16b,v1.16b
-	aese	v0.16b,v17.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v17.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v2.16b},[x0],x12
-	aese	v0.16b,v20.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v20.16b
-	aesmc	v1.16b,v1.16b
-	ld1	{v3.16b},[x0]
-	aese	v0.16b,v21.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v21.16b
-	aesmc	v1.16b,v1.16b
-	eor	v2.16b,v2.16b,v7.16b
-	aese	v0.16b,v22.16b
-	aesmc	v0.16b,v0.16b
-	aese	v1.16b,v22.16b
-	aesmc	v1.16b,v1.16b
-	eor	v3.16b,v3.16b,v7.16b
-	aese	v0.16b,v23.16b
-	aese	v1.16b,v23.16b
-
-	cmp	x2,#1
-	eor	v2.16b,v2.16b,v0.16b
-	eor	v3.16b,v3.16b,v1.16b
-	st1	{v2.16b},[x1],#16
-	b.eq	Lctr32_done
-	st1	{v3.16b},[x1]
-
-Lctr32_done:
-	ldr	x29,[sp],#16
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
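The deleted aesv8-armv8-apple.S above carries the ARMv8 Crypto Extensions implementations of the AES key schedule, single-block encrypt/decrypt, CBC, and CTR32 mode. In _aes_hw_ctr32_encrypt_blocks the block counter is the last four bytes of the IV, treated as a big-endian 32-bit value, which is why the counter is handled with the rev/add/rev sequence under the __AARCH64EB__ guard. A minimal C sketch of that counter convention, assuming a little-endian host and the GCC/Clang __builtin_bswap32 builtin (the helper name is hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Advance the big-endian 32-bit block counter held in ivec[12..15],
       mirroring the rev / add / rev sequence above. The counter wraps
       modulo 2^32 and the first 12 bytes of the IV are left untouched;
       the AES rounds themselves use the AESE/AESMC hardware instructions
       and are not sketched here. */
    static void ctr32_increment(uint8_t ivec[16]) {
        uint32_t ctr;
        memcpy(&ctr, ivec + 12, sizeof(ctr));
        ctr = __builtin_bswap32(__builtin_bswap32(ctr) + 1u);
        memcpy(ivec + 12, &ctr, sizeof(ctr));
    }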
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S
deleted file mode 100644
index 13be797..0000000
--- a/apple-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-apple.S
+++ /dev/null
@@ -1,1555 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-#if __ARM_MAX_ARCH__ >= 8
-
-
-.text
-.globl	_aes_gcm_enc_kernel
-.private_extern	_aes_gcm_enc_kernel
-
-.align	4
-_aes_gcm_enc_kernel:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29, x30, [sp, #-128]!
-	mov	x29, sp
-	stp	x19, x20, [sp, #16]
-	mov	x16, x4
-	mov	x8, x5
-	stp	x21, x22, [sp, #32]
-	stp	x23, x24, [sp, #48]
-	stp	d8, d9, [sp, #64]
-	stp	d10, d11, [sp, #80]
-	stp	d12, d13, [sp, #96]
-	stp	d14, d15, [sp, #112]
-	ldr	w17, [x8, #240]
-	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
-	ldp	x13, x14, [x19]                       // load round N keys
-	ldr	q31, [x19, #-16]                        // load round N-1 keys
-	add	x4, x0, x1, lsr #3   // end_input_ptr
-	lsr	x5, x1, #3              // byte_len
-	mov	x15, x5
-	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
-	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
-	sub	x5, x5, #1      // byte_len - 1
-	ldr	q18, [x8, #0]                                  // load rk0
-	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
-	ldr	q25, [x8, #112]                                // load rk7
-	add	x5, x5, x0
-	lsr	x12, x11, #32
-	fmov	d2, x10                               // CTR block 2
-	orr	w11, w11, w11
-	rev	w12, w12                                // rev_ctr32
-	fmov	d1, x10                               // CTR block 1
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
-	add	w12, w12, #1                            // increment rev_ctr32
-	rev	w9, w12                                 // CTR block 1
-	fmov	d3, x10                               // CTR block 3
-	orr	x9, x11, x9, lsl #32            // CTR block 1
-	add	w12, w12, #1                            // CTR block 1
-	ldr	q19, [x8, #16]                                 // load rk1
-	fmov	v1.d[1], x9                               // CTR block 1
-	rev	w9, w12                                 // CTR block 2
-	add	w12, w12, #1                            // CTR block 2
-	orr	x9, x11, x9, lsl #32            // CTR block 2
-	ldr	q20, [x8, #32]                                 // load rk2
-	fmov	v2.d[1], x9                               // CTR block 2
-	rev	w9, w12                                 // CTR block 3
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
-	orr	x9, x11, x9, lsl #32            // CTR block 3
-	fmov	v3.d[1], x9                               // CTR block 3
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
-	ldr	q21, [x8, #48]                                 // load rk3
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
-	ldr	q24, [x8, #96]                                 // load rk6
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
-	ldr	q23, [x8, #80]                                 // load rk5
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
-	ldr	q14, [x6, #48]                              // load h3l | h3h
-	ext	v14.16b, v14.16b, v14.16b, #8
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
-	ldr	q22, [x8, #64]                                 // load rk4
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
-	ldr	q13, [x6, #32]                              // load h2l | h2h
-	ext	v13.16b, v13.16b, v13.16b, #8
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
-	ldr	q30, [x8, #192]                               // load rk12
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
-	ldr	q15, [x6, #80]                              // load h4l | h4h
-	ext	v15.16b, v15.16b, v15.16b, #8
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
-	ldr	q29, [x8, #176]                               // load rk11
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
-	ldr	q26, [x8, #128]                                // load rk8
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
-	add	w12, w12, #1                            // CTR block 3
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
-	ld1	{ v11.16b}, [x3]
-	ext	v11.16b, v11.16b, v11.16b, #8
-	rev64	v11.16b, v11.16b
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
-	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
-	ldr	q27, [x8, #144]                                // load rk9
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
-	ldr	q12, [x6]                                   // load h1l | h1h
-	ext	v12.16b, v12.16b, v12.16b, #8
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
-	ldr	q28, [x8, #160]                               // load rk10
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
-	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
-	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
-	b.lt	Lenc_finish_first_blocks                         // branch if AES-128
-
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
-	b.eq	Lenc_finish_first_blocks                         // branch if AES-192
-
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
-
-Lenc_finish_first_blocks:
-	cmp	x0, x5                   // check if we have <= 4 blocks
-	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
-	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
-	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
-	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
-	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
-	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
-	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
-	b.ge	Lenc_tail                                        // handle tail
-
-	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
-	rev	w9, w12                                 // CTR block 4
-	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
-	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
-	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
-	add	x0, x0, #64                       // AES input_ptr update
-	eor	x19, x19, x13                      // AES block 1 - round N low
-	eor	x20, x20, x14                      // AES block 1 - round N high
-	fmov	d5, x19                               // AES block 1 - mov low
-	eor	x6, x6, x13                      // AES block 0 - round N low
-	eor	x7, x7, x14                      // AES block 0 - round N high
-	eor	x24, x24, x14                      // AES block 3 - round N high
-	fmov	d4, x6                               // AES block 0 - mov low
-	cmp	x0, x5                   // check if we have <= 8 blocks
-	fmov	v4.d[1], x7                           // AES block 0 - mov high
-	eor	x23, x23, x13                      // AES block 3 - round N low
-	eor	x21, x21, x13                      // AES block 2 - round N low
-	fmov	v5.d[1], x20                           // AES block 1 - mov high
-	fmov	d6, x21                               // AES block 2 - mov low
-	add	w12, w12, #1                            // CTR block 4
-	orr	x9, x11, x9, lsl #32            // CTR block 4
-	fmov	d7, x23                               // AES block 3 - mov low
-	eor	x22, x22, x14                      // AES block 2 - round N high
-	fmov	v6.d[1], x22                           // AES block 2 - mov high
-	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
-	fmov	d0, x10                               // CTR block 4
-	fmov	v0.d[1], x9                               // CTR block 4
-	rev	w9, w12                                 // CTR block 5
-	add	w12, w12, #1                            // CTR block 5
-	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
-	fmov	d1, x10                               // CTR block 5
-	orr	x9, x11, x9, lsl #32            // CTR block 5
-	fmov	v1.d[1], x9                               // CTR block 5
-	rev	w9, w12                                 // CTR block 6
-	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
-	fmov	v7.d[1], x24                           // AES block 3 - mov high
-	orr	x9, x11, x9, lsl #32            // CTR block 6
-	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
-	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
-	add	w12, w12, #1                            // CTR block 6
-	fmov	d2, x10                               // CTR block 6
-	fmov	v2.d[1], x9                               // CTR block 6
-	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
-	rev	w9, w12                                 // CTR block 7
-	orr	x9, x11, x9, lsl #32            // CTR block 7
-	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
-	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
-	b.ge	Lenc_prepretail                                  // do prepretail
-
-Lenc_main_loop:	//	main loop start
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
-	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
-	fmov	d3, x10                               // CTR block 4k+3
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
-	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
-	fmov	v3.d[1], x9                               // CTR block 4k+3
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
-	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
-	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
-	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
-	eor	x23, x23, x13                      // AES block 4k+7 - round N low
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
-	mov	d10, v17.d[1]                               // GHASH block 4k - mid
-	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
-	eor	x22, x22, x14                      // AES block 4k+6 - round N high
-	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
-	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
-	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
-	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
-	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
-	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
-	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
-	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
-	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
-	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
-	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
-	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
-	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
-	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
-	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
-	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
-	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
-	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
-	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
-	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
-	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
-	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
-	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
-	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
-	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
-	eor	x19, x19, x13                      // AES block 4k+5 - round N low
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
-	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
-	eor	x21, x21, x13                      // AES block 4k+6 - round N low
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
-	movi	v8.8b, #0xc2
-	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
-	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	fmov	d5, x19                               // AES block 4k+5 - mov low
-	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
-	b.lt	Lenc_main_loop_continue                          // branch if AES-128
-
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
-	b.eq	Lenc_main_loop_continue                          // branch if AES-192
-
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
-
-Lenc_main_loop_continue:
-	shl	d8, d8, #56               // mod_constant
-	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
-	add	w12, w12, #1                            // CTR block 4k+3
-	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
-	add	x0, x0, #64                       // AES input_ptr update
-	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
-	rev	w9, w12                                 // CTR block 4k+8
-	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
-	eor	x6, x6, x13                      // AES block 4k+4 - round N low
-	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
-	eor	x7, x7, x14                      // AES block 4k+4 - round N high
-	fmov	d4, x6                               // AES block 4k+4 - mov low
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
-	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
-	eor	x20, x20, x14                      // AES block 4k+5 - round N high
-	eor	x24, x24, x14                      // AES block 4k+7 - round N high
-	add	w12, w12, #1                            // CTR block 4k+8
-	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
-	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
-	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
-	fmov	d7, x23                               // AES block 4k+7 - mov low
-	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
-	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
-	fmov	d6, x21                               // AES block 4k+6 - mov low
-	cmp	x0, x5                   // LOOP CONTROL
-	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
-	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
-	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
-	fmov	d0, x10                               // CTR block 4k+8
-	fmov	v0.d[1], x9                               // CTR block 4k+8
-	rev	w9, w12                                 // CTR block 4k+9
-	add	w12, w12, #1                            // CTR block 4k+9
-	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
-	fmov	d1, x10                               // CTR block 4k+9
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
-	fmov	v1.d[1], x9                               // CTR block 4k+9
-	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
-	rev	w9, w12                                 // CTR block 4k+10
-	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
-	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
-	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
-	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
-	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
-	add	w12, w12, #1                            // CTR block 4k+10
-	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
-	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
-	fmov	d2, x10                               // CTR block 4k+10
-	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
-	fmov	v2.d[1], x9                               // CTR block 4k+10
-	rev	w9, w12                                 // CTR block 4k+11
-	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
-	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
-	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
-	b.lt	Lenc_main_loop
-
-Lenc_prepretail:	//	PREPRETAIL
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
-	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
-	fmov	d3, x10                               // CTR block 4k+3
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
-	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
-	fmov	v3.d[1], x9                               // CTR block 4k+3
-	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
-	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
-	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
-	mov	d10, v17.d[1]                               // GHASH block 4k - mid
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
-	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
-	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
-	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
-	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
-	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
-	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
-	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
-	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
-	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
-	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
-	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
-	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
-	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
-	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
-	add	w12, w12, #1                            // CTR block 4k+3
-	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
-	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
-	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
-	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
-	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
-	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
-	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
-	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
-	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
-	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
-	movi	v8.8b, #0xc2
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
-	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
-	shl	d8, d8, #56               // mod_constant
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
-	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
-	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
-	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
-	pmull	v4.1q, v9.1d, v8.1d
-	ext	v9.16b, v9.16b, v9.16b, #8
-	eor	v10.16b, v10.16b, v11.16b
-	b.lt	Lenc_finish_prepretail                           // branch if AES-128
-
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
-	b.eq	Lenc_finish_prepretail                           // branch if AES-192
-
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
-
-Lenc_finish_prepretail:
-	eor	v10.16b, v10.16b, v4.16b
-	eor	v10.16b, v10.16b, v9.16b
-	pmull	v4.1q, v10.1d, v8.1d
-	ext	v10.16b, v10.16b, v10.16b, #8
-	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
-	eor	v11.16b, v11.16b, v4.16b
-	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
-	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
-	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
-	eor	v11.16b, v11.16b, v10.16b
-
-Lenc_tail:	//	TAIL
-	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
-	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
-	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
-	eor	x6, x6, x13                      // AES block 4k+4 - round N low
-	eor	x7, x7, x14                      // AES block 4k+4 - round N high
-	cmp	x5, #48
-	fmov	d4, x6                               // AES block 4k+4 - mov low
-	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
-	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
-	b.gt	Lenc_blocks_more_than_3
-	cmp	x5, #32
-	mov	v3.16b, v2.16b
-	movi	v11.8b, #0
-	movi	v9.8b, #0
-	sub	w12, w12, #1
-	mov	v2.16b, v1.16b
-	movi	v10.8b, #0
-	b.gt	Lenc_blocks_more_than_2
-	mov	v3.16b, v1.16b
-	sub	w12, w12, #1
-	cmp	x5, #16
-	b.gt	Lenc_blocks_more_than_1
-	sub	w12, w12, #1
-	b	Lenc_blocks_less_than_1
-Lenc_blocks_more_than_3:	//	blocks left >  3
-	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
-	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
-	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
-	eor	x6, x6, x13                     // AES final-2 block - round N low
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	eor	x7, x7, x14                     // AES final-2 block - round N high
-	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
-	fmov	d5, x6                                // AES final-2 block - mov low
-	fmov	v5.d[1], x7                            // AES final-2 block - mov high
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
-	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
-	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
-	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
-	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
-Lenc_blocks_more_than_2:	//	blocks left >  2
-	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
-	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
-	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
-	eor	x6, x6, x13                     // AES final-1 block - round N low
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	fmov	d5, x6                                // AES final-1 block - mov low
-	eor	x7, x7, x14                     // AES final-1 block - round N high
-	fmov	v5.d[1], x7                            // AES final-1 block - mov high
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
-	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
-	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
-	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
-	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
-	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
-	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
-	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
-Lenc_blocks_more_than_1:	//	blocks left >  1
-	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
-	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
-	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	eor	x6, x6, x13                     // AES final block - round N low
-	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
-	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
-	eor	x7, x7, x14                     // AES final block - round N high
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
-	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
-	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
-	fmov	d5, x6                                // AES final block - mov low
-	fmov	v5.d[1], x7                            // AES final block - mov high
-	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
-	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
-	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
-	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
-	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
-Lenc_blocks_less_than_1:	//	blocks left <= 1
-	and	x1, x1, #127                   // bit_length %= 128
-	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
-	sub	x1, x1, #128                   // bit_length -= 128
-	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
-	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
-	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
-	and	x1, x1, #127                   // bit_length %= 128
-	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
-	cmp	x1, #64
-	csel	x6, x13, x14, lt
-	csel	x7, x14, xzr, lt
-	fmov	d0, x6                                // ctr0b is mask for last block
-	fmov	v0.d[1], x7
-	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
-	rev64	v4.16b, v5.16b                                   // GHASH final block
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
-	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
-	mov	d8, v4.d[1]                                 // GHASH final block - mid
-	rev	w9, w12
-	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
-	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
-	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
-	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
-	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
-	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
-	movi	v8.8b, #0xc2
-	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
-	shl	d8, d8, #56              // mod_constant
-	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
-	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
-	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
-	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
-	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
-	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
-	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
-	str	w9, [x16, #12]                         // store the updated counter
-	st1	{ v5.16b}, [x2]                         // store all 16B
-	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
-	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
-	ext	v11.16b, v11.16b, v11.16b, #8
-	rev64	v11.16b, v11.16b
-	mov	x0, x15
-	st1	{ v11.16b }, [x3]
-	ldp	x19, x20, [sp, #16]
-	ldp	x21, x22, [sp, #32]
-	ldp	x23, x24, [sp, #48]
-	ldp	d8, d9, [sp, #64]
-	ldp	d10, d11, [sp, #80]
-	ldp	d12, d13, [sp, #96]
-	ldp	d14, d15, [sp, #112]
-	ldp	x29, x30, [sp], #128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_aes_gcm_dec_kernel
-.private_extern	_aes_gcm_dec_kernel
-
-.align	4
-_aes_gcm_dec_kernel:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29, x30, [sp, #-128]!
-	mov	x29, sp
-	stp	x19, x20, [sp, #16]
-	mov	x16, x4
-	mov	x8, x5
-	stp	x21, x22, [sp, #32]
-	stp	x23, x24, [sp, #48]
-	stp	d8, d9, [sp, #64]
-	stp	d10, d11, [sp, #80]
-	stp	d12, d13, [sp, #96]
-	stp	d14, d15, [sp, #112]
-	ldr	w17, [x8, #240]
-	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
-	ldp	x13, x14, [x19]                       // load round N keys
-	ldr	q31, [x19, #-16]                        // load round N-1 keys
-	lsr	x5, x1, #3              // byte_len
-	mov	x15, x5
-	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
-	ldr	q26, [x8, #128]                                // load rk8
-	sub	x5, x5, #1      // byte_len - 1
-	ldr	q25, [x8, #112]                                // load rk7
-	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
-	add	x4, x0, x1, lsr #3   // end_input_ptr
-	ldr	q24, [x8, #96]                                 // load rk6
-	lsr	x12, x11, #32
-	ldr	q23, [x8, #80]                                 // load rk5
-	orr	w11, w11, w11
-	ldr	q21, [x8, #48]                                 // load rk3
-	add	x5, x5, x0
-	rev	w12, w12                                // rev_ctr32
-	add	w12, w12, #1                            // increment rev_ctr32
-	fmov	d3, x10                               // CTR block 3
-	rev	w9, w12                                 // CTR block 1
-	add	w12, w12, #1                            // CTR block 1
-	fmov	d1, x10                               // CTR block 1
-	orr	x9, x11, x9, lsl #32            // CTR block 1
-	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
-	fmov	v1.d[1], x9                               // CTR block 1
-	rev	w9, w12                                 // CTR block 2
-	add	w12, w12, #1                            // CTR block 2
-	fmov	d2, x10                               // CTR block 2
-	orr	x9, x11, x9, lsl #32            // CTR block 2
-	fmov	v2.d[1], x9                               // CTR block 2
-	rev	w9, w12                                 // CTR block 3
-	orr	x9, x11, x9, lsl #32            // CTR block 3
-	ldr	q18, [x8, #0]                                  // load rk0
-	fmov	v3.d[1], x9                               // CTR block 3
-	add	w12, w12, #1                            // CTR block 3
-	ldr	q22, [x8, #64]                                 // load rk4
-	ldr	q19, [x8, #16]                                 // load rk1
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
-	ldr	q14, [x6, #48]                              // load h3l | h3h
-	ext	v14.16b, v14.16b, v14.16b, #8
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
-	ldr	q15, [x6, #80]                              // load h4l | h4h
-	ext	v15.16b, v15.16b, v15.16b, #8
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
-	ldr	q13, [x6, #32]                              // load h2l | h2h
-	ext	v13.16b, v13.16b, v13.16b, #8
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
-	ldr	q20, [x8, #32]                                 // load rk2
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
-	ld1	{ v11.16b}, [x3]
-	ext	v11.16b, v11.16b, v11.16b, #8
-	rev64	v11.16b, v11.16b
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
-	ldr	q27, [x8, #144]                                // load rk9
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
-	ldr	q30, [x8, #192]                               // load rk12
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
-	ldr	q12, [x6]                                   // load h1l | h1h
-	ext	v12.16b, v12.16b, v12.16b, #8
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
-	ldr	q28, [x8, #160]                               // load rk10
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
-	ldr	q29, [x8, #176]                               // load rk11
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
-	b.lt	Ldec_finish_first_blocks                         // branch if AES-128
-
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
-	b.eq	Ldec_finish_first_blocks                         // branch if AES-192
-
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
-
-Ldec_finish_first_blocks:
-	cmp	x0, x5                   // check if we have <= 4 blocks
-	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
-	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
-	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
-	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
-	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
-	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
-	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
-	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
-	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
-	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
-	b.ge	Ldec_tail                                        // handle tail
-
-	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
-	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
-	rev	w9, w12                                 // CTR block 4
-	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
-	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
-	rev64	v5.16b, v5.16b                                    // GHASH block 1
-	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
-	mov	x7, v0.d[1]                            // AES block 0 - mov high
-	mov	x6, v0.d[0]                            // AES block 0 - mov low
-	rev64	v4.16b, v4.16b                                    // GHASH block 0
-	add	w12, w12, #1                            // CTR block 4
-	fmov	d0, x10                               // CTR block 4
-	orr	x9, x11, x9, lsl #32            // CTR block 4
-	fmov	v0.d[1], x9                               // CTR block 4
-	rev	w9, w12                                 // CTR block 5
-	add	w12, w12, #1                            // CTR block 5
-	mov	x19, v1.d[0]                            // AES block 1 - mov low
-	orr	x9, x11, x9, lsl #32            // CTR block 5
-	mov	x20, v1.d[1]                            // AES block 1 - mov high
-	eor	x7, x7, x14                    // AES block 0 - round N high
-	eor	x6, x6, x13                    // AES block 0 - round N low
-	stp	x6, x7, [x2], #16        // AES block 0 - store result
-	fmov	d1, x10                               // CTR block 5
-	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
-	add	x0, x0, #64                       // AES input_ptr update
-	fmov	v1.d[1], x9                               // CTR block 5
-	rev	w9, w12                                 // CTR block 6
-	add	w12, w12, #1                            // CTR block 6
-	eor	x19, x19, x13                    // AES block 1 - round N low
-	orr	x9, x11, x9, lsl #32            // CTR block 6
-	eor	x20, x20, x14                    // AES block 1 - round N high
-	stp	x19, x20, [x2], #16        // AES block 1 - store result
-	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
-	cmp	x0, x5                   // check if we have <= 8 blocks
-	b.ge	Ldec_prepretail                                  // do prepretail
-
-Ldec_main_loop:	//	main loop start
-	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
-	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
-	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
-	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
-	fmov	d2, x10                               // CTR block 4k+6
-	fmov	v2.d[1], x9                               // CTR block 4k+6
-	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
-	rev	w9, w12                                 // CTR block 4k+7
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
-	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
-	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
-	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
-	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
-	fmov	d3, x10                               // CTR block 4k+7
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
-	fmov	v3.d[1], x9                               // CTR block 4k+7
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
-	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
-	eor	x22, x22, x14                    // AES block 4k+2 - round N high
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
-	mov	d10, v17.d[1]                               // GHASH block 4k - mid
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
-	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
-	eor	x21, x21, x13                    // AES block 4k+2 - round N low
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
-	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
-	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
-	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
-	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
-	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
-	eor	x23, x23, x13                    // AES block 4k+3 - round N low
-	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
-	eor	x24, x24, x14                    // AES block 4k+3 - round N high
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
-	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
-	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
-	add	w12, w12, #1                            // CTR block 4k+7
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
-	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
-	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
-	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
-	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
-	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
-	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
-	rev	w9, w12                                 // CTR block 4k+8
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
-	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
-	add	w12, w12, #1                            // CTR block 4k+8
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
-	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
-	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
-	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
-	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
-	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
-	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
-	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
-	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
-	movi	v8.8b, #0xc2
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
-	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
-	shl	d8, d8, #56               // mod_constant
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
-	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
-	b.lt	Ldec_main_loop_continue                          // branch if AES-128
-
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
-	b.eq	Ldec_main_loop_continue                          // branch if AES-192
-
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
-
-Ldec_main_loop_continue:
-	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
-	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
-	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
-	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
-	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
-	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
-	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
-	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
-	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
-	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
-	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
-	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
-	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
-	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
-	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
-	add	x0, x0, #64                       // AES input_ptr update
-	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
-	fmov	d0, x10                               // CTR block 4k+8
-	fmov	v0.d[1], x9                               // CTR block 4k+8
-	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
-	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
-	rev	w9, w12                                 // CTR block 4k+9
-	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
-	cmp	x0, x5                   // LOOP CONTROL
-	add	w12, w12, #1                            // CTR block 4k+9
-	eor	x6, x6, x13                    // AES block 4k+4 - round N low
-	eor	x7, x7, x14                    // AES block 4k+4 - round N high
-	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
-	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
-	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
-	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
-	fmov	d1, x10                               // CTR block 4k+9
-	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
-	fmov	v1.d[1], x9                               // CTR block 4k+9
-	rev	w9, w12                                 // CTR block 4k+10
-	add	w12, w12, #1                            // CTR block 4k+10
-	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
-	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
-	eor	x20, x20, x14                    // AES block 4k+5 - round N high
-	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
-	eor	x19, x19, x13                    // AES block 4k+5 - round N low
-	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
-	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
-	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
-	b.lt	Ldec_main_loop
-
-Ldec_prepretail:	//	PREPRETAIL
-	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
-	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
-	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
-	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
-	fmov	d2, x10                               // CTR block 4k+6
-	fmov	v2.d[1], x9                               // CTR block 4k+6
-	rev	w9, w12                                 // CTR block 4k+7
-	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
-	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
-	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
-	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
-	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
-	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
-	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
-	fmov	d3, x10                               // CTR block 4k+7
-	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
-	fmov	v3.d[1], x9                               // CTR block 4k+7
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
-	mov	d10, v17.d[1]                               // GHASH block 4k - mid
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
-	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
-	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
-	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
-	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
-	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
-	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
-	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
-	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
-	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
-	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
-	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
-	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
-	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
-	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
-	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
-	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
-	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
-	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
-	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
-	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
-	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
-	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
-	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
-	movi	v8.8b, #0xc2
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
-	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
-	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
-	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
-	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
-	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
-	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
-	shl	d8, d8, #56               // mod_constant
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
-	b.lt	Ldec_finish_prepretail                           // branch if AES-128
-
-	aese	v1.16b, v27.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
-	aese	v2.16b, v27.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
-	aese	v3.16b, v27.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
-	aese	v0.16b, v27.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
-	aese	v2.16b, v28.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
-	aese	v3.16b, v28.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
-	aese	v0.16b, v28.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
-	aese	v1.16b, v28.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
-	b.eq	Ldec_finish_prepretail                           // branch if AES-192
-
-	aese	v2.16b, v29.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
-	aese	v0.16b, v29.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
-	aese	v1.16b, v29.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
-	aese	v2.16b, v30.16b
-	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
-	aese	v3.16b, v29.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
-	aese	v1.16b, v30.16b
-	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
-	aese	v0.16b, v30.16b
-	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
-	aese	v3.16b, v30.16b
-	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
-
-Ldec_finish_prepretail:
-	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
-	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
-	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
-	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
-	eor	x22, x22, x14                    // AES block 4k+2 - round N high
-	eor	x23, x23, x13                    // AES block 4k+3 - round N low
-	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
-	add	w12, w12, #1                            // CTR block 4k+7
-	eor	x21, x21, x13                    // AES block 4k+2 - round N low
-	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
-	eor	x24, x24, x14                    // AES block 4k+3 - round N high
-	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
-	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
-	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
-
-	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
-	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
-	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
-	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
-	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
-	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
-
-Ldec_tail:	//	TAIL
-	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
-	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
-	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
-	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
-	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
-	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
-	cmp	x5, #48
-	eor	x6, x6, x13                    // AES block 4k+4 - round N low
-	eor	x7, x7, x14                    // AES block 4k+4 - round N high
-	b.gt	Ldec_blocks_more_than_3
-	sub	w12, w12, #1
-	mov	v3.16b, v2.16b
-	movi	v10.8b, #0
-	movi	v11.8b, #0
-	cmp	x5, #32
-	movi	v9.8b, #0
-	mov	v2.16b, v1.16b
-	b.gt	Ldec_blocks_more_than_2
-	sub	w12, w12, #1
-	mov	v3.16b, v1.16b
-	cmp	x5, #16
-	b.gt	Ldec_blocks_more_than_1
-	sub	w12, w12, #1
-	b	Ldec_blocks_less_than_1
-Ldec_blocks_more_than_3:	//	blocks left >  3
-	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
-	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
-	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
-	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
-	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
-	mov	x6, v0.d[0]                           // AES final-2 block - mov low
-	mov	x7, v0.d[1]                           // AES final-2 block - mov high
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
-	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
-	eor	x6, x6, x13                   // AES final-2 block - round N low
-	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
-	eor	x7, x7, x14                   // AES final-2 block - round N high
-Ldec_blocks_more_than_2:	//	blocks left >  2
-	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
-	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
-	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
-	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
-	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
-	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
-	mov	x6, v0.d[0]                           // AES final-1 block - mov low
-	mov	x7, v0.d[1]                           // AES final-1 block - mov high
-	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
-	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
-	eor	x6, x6, x13                   // AES final-1 block - round N low
-	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
-	eor	x7, x7, x14                   // AES final-1 block - round N high
-Ldec_blocks_more_than_1:	//	blocks left >  1
-	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
-	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
-	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
-	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
-	movi	v8.8b, #0                                       // suppress further partial tag feed in
-	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
-	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
-	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
-	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
-	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
-	mov	x6, v0.d[0]                           // AES final block - mov low
-	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
-	mov	x7, v0.d[1]                           // AES final block - mov high
-	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
-	eor	x6, x6, x13                   // AES final block - round N low
-	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
-	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
-	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
-	eor	x7, x7, x14                   // AES final block - round N high
-Ldec_blocks_less_than_1:	//	blocks left <= 1
-	and	x1, x1, #127                   // bit_length %= 128
-	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
-	sub	x1, x1, #128                   // bit_length -= 128
-	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
-	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
-	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
-	and	x1, x1, #127                   // bit_length %= 128
-	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
-	cmp	x1, #64
-	csel	x9, x13, x14, lt
-	csel	x10, x14, xzr, lt
-	fmov	d0, x9                                  // ctr0b is mask for last block
-	and	x6, x6, x9
-	mov	v0.d[1], x10
-	bic	x4, x4, x9          // mask out low existing bytes
-	rev	w9, w12
-	bic	x5, x5, x10      // mask out high existing bytes
-	orr	x6, x6, x4
-	and	x7, x7, x10
-	orr	x7, x7, x5
-	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
-	rev64	v4.16b, v5.16b                                    // GHASH final block
-	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
-	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
-	mov	d8, v4.d[1]                                  // GHASH final block - mid
-	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
-	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
-	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
-	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
-	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
-	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
-	movi	v8.8b, #0xc2
-	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
-	shl	d8, d8, #56               // mod_constant
-	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
-	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
-	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
-	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
-	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
-	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
-	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
-	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
-	stp	x6, x7, [x2]
-	str	w9, [x16, #12]                          // store the updated counter
-	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
-	ext	v11.16b, v11.16b, v11.16b, #8
-	rev64	v11.16b, v11.16b
-	mov	x0, x15
-	st1	{ v11.16b }, [x3]
-	ldp	x19, x20, [sp, #16]
-	ldp	x21, x22, [sp, #32]
-	ldp	x23, x24, [sp, #48]
-	ldp	d8, d9, [sp, #64]
-	ldp	d10, d11, [sp, #80]
-	ldp	d12, d13, [sp, #96]
-	ldp	d14, d15, [sp, #112]
-	ldp	x29, x30, [sp], #128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
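The enc/dec GCM kernels removed above pick their per-block round count by loading the word at byte offset 240 of the key schedule ("ldr w17, [x8, #240]") and comparing it against 12 ("cmp x17, #12", then b.lt for AES-128, b.eq for AES-192). A minimal C sketch of that dispatch, assuming the usual AES_KEY-style layout of 60 round-key words followed by the round count; the struct and function names here are illustrative, not BoringSSL API:

#include <stdint.h>

/* Assumed layout mirroring the convention the assembly reads from:
 * 4 * (14 + 1) round-key words, then the round count at byte offset 240. */
typedef struct {
  uint32_t rd_key[60];
  uint32_t rounds; /* 10, 12 or 14 */
} gcm_aes_key_sketch;

static int gcm_rounds_for_variant(const gcm_aes_key_sketch *ks) {
  if (ks->rounds < 12) {
    return 10; /* AES-128: the "b.lt" path, rounds 0..8 plus final round */
  } else if (ks->rounds == 12) {
    return 12; /* AES-192: the "b.eq" path adds rounds 9 and 10 */
  }
  return 14;   /* AES-256: fall-through path adds rounds 11 and 12 as well */
}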
diff --git a/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S b/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S
deleted file mode 100644
index cf798a3..0000000
--- a/apple-aarch64/crypto/fipsmodule/armv8-mont-apple.S
+++ /dev/null
@@ -1,1425 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl	_bn_mul_mont
-.private_extern	_bn_mul_mont
-
-.align	5
-_bn_mul_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	tst	x5,#7
-	b.eq	__bn_sqr8x_mont
-	tst	x5,#3
-	b.eq	__bn_mul4x_mont
-Lmul_mont:
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	ldr	x9,[x2],#8		// bp[0]
-	sub	x22,sp,x5,lsl#3
-	ldp	x7,x8,[x1],#16	// ap[0..1]
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	and	x22,x22,#-16		// ABI says so
-	ldp	x13,x14,[x3],#16	// np[0..1]
-
-	mul	x6,x7,x9		// ap[0]*bp[0]
-	sub	x21,x5,#16		// j=num-2
-	umulh	x7,x7,x9
-	mul	x10,x8,x9		// ap[1]*bp[0]
-	umulh	x11,x8,x9
-
-	mul	x15,x6,x4		// "tp[0]"*n0
-	mov	sp,x22			// alloca
-
-	// (*)	mul	x12,x13,x15	// np[0]*m1
-	umulh	x13,x13,x15
-	mul	x16,x14,x15		// np[1]*m1
-	// (*)	adds	x12,x12,x6	// discarded
-	// (*)	As for removal of first multiplication and addition
-	//	instructions. The outcome of first addition is
-	//	guaranteed to be zero, which leaves two computationally
-	//	significant outcomes: it either carries or not. Then
-	//	question is when does it carry? Is there alternative
-	//	way to deduce it? If you follow operations, you can
-	//	observe that condition for carry is quite simple:
-	//	x6 being non-zero. So that carry can be calculated
-	//	by adding -1 to x6. That's what next instruction does.
-	subs	xzr,x6,#1		// (*)
-	umulh	x17,x14,x15
-	adc	x13,x13,xzr
-	cbz	x21,L1st_skip
-
-L1st:
-	ldr	x8,[x1],#8
-	adds	x6,x10,x7
-	sub	x21,x21,#8		// j--
-	adc	x7,x11,xzr
-
-	ldr	x14,[x3],#8
-	adds	x12,x16,x13
-	mul	x10,x8,x9		// ap[j]*bp[0]
-	adc	x13,x17,xzr
-	umulh	x11,x8,x9
-
-	adds	x12,x12,x6
-	mul	x16,x14,x15		// np[j]*m1
-	adc	x13,x13,xzr
-	umulh	x17,x14,x15
-	str	x12,[x22],#8		// tp[j-1]
-	cbnz	x21,L1st
-
-L1st_skip:
-	adds	x6,x10,x7
-	sub	x1,x1,x5		// rewind x1
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	sub	x3,x3,x5		// rewind x3
-	adc	x13,x17,xzr
-
-	adds	x12,x12,x6
-	sub	x20,x5,#8		// i=num-1
-	adcs	x13,x13,x7
-
-	adc	x19,xzr,xzr		// upmost overflow bit
-	stp	x12,x13,[x22]
-
-Louter:
-	ldr	x9,[x2],#8		// bp[i]
-	ldp	x7,x8,[x1],#16
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-
-	mul	x6,x7,x9		// ap[0]*bp[i]
-	sub	x21,x5,#16		// j=num-2
-	umulh	x7,x7,x9
-	ldp	x13,x14,[x3],#16
-	mul	x10,x8,x9		// ap[1]*bp[i]
-	adds	x6,x6,x23
-	umulh	x11,x8,x9
-	adc	x7,x7,xzr
-
-	mul	x15,x6,x4
-	sub	x20,x20,#8		// i--
-
-	// (*)	mul	x12,x13,x15	// np[0]*m1
-	umulh	x13,x13,x15
-	mul	x16,x14,x15		// np[1]*m1
-	// (*)	adds	x12,x12,x6
-	subs	xzr,x6,#1		// (*)
-	umulh	x17,x14,x15
-	cbz	x21,Linner_skip
-
-Linner:
-	ldr	x8,[x1],#8
-	adc	x13,x13,xzr
-	ldr	x23,[x22],#8		// tp[j]
-	adds	x6,x10,x7
-	sub	x21,x21,#8		// j--
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	ldr	x14,[x3],#8
-	adc	x13,x17,xzr
-
-	mul	x10,x8,x9		// ap[j]*bp[i]
-	adds	x6,x6,x23
-	umulh	x11,x8,x9
-	adc	x7,x7,xzr
-
-	mul	x16,x14,x15		// np[j]*m1
-	adds	x12,x12,x6
-	umulh	x17,x14,x15
-	str	x12,[x22,#-16]		// tp[j-1]
-	cbnz	x21,Linner
-
-Linner_skip:
-	ldr	x23,[x22],#8		// tp[j]
-	adc	x13,x13,xzr
-	adds	x6,x10,x7
-	sub	x1,x1,x5		// rewind x1
-	adc	x7,x11,xzr
-
-	adds	x12,x16,x13
-	sub	x3,x3,x5		// rewind x3
-	adcs	x13,x17,x19
-	adc	x19,xzr,xzr
-
-	adds	x6,x6,x23
-	adc	x7,x7,xzr
-
-	adds	x12,x12,x6
-	adcs	x13,x13,x7
-	adc	x19,x19,xzr		// upmost overflow bit
-	stp	x12,x13,[x22,#-16]
-
-	cbnz	x20,Louter
-
-	// Final step. We see if result is larger than modulus, and
-	// if it is, subtract the modulus. But comparison implies
-	// subtraction. So we subtract modulus, see if it borrowed,
-	// and conditionally copy original value.
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-	ldr	x14,[x3],#8		// np[0]
-	subs	x21,x5,#8		// j=num-1 and clear borrow
-	mov	x1,x0
-Lsub:
-	sbcs	x8,x23,x14		// tp[j]-np[j]
-	ldr	x23,[x22],#8
-	sub	x21,x21,#8		// j--
-	ldr	x14,[x3],#8
-	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
-	cbnz	x21,Lsub
-
-	sbcs	x8,x23,x14
-	sbcs	x19,x19,xzr		// did it borrow?
-	str	x8,[x1],#8		// rp[num-1]
-
-	ldr	x23,[sp]		// tp[0]
-	add	x22,sp,#8
-	ldr	x8,[x0],#8		// rp[0]
-	sub	x5,x5,#8		// num--
-	nop
-Lcond_copy:
-	sub	x5,x5,#8		// num--
-	csel	x14,x23,x8,lo		// did it borrow?
-	ldr	x23,[x22],#8
-	ldr	x8,[x0],#8
-	str	xzr,[x22,#-16]		// wipe tp
-	str	x14,[x0,#-16]
-	cbnz	x5,Lcond_copy
-
-	csel	x14,x23,x8,lo
-	str	xzr,[x22,#-8]		// wipe tp
-	str	x14,[x0,#-8]
-
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldr	x29,[sp],#64
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.align	5
-__bn_sqr8x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
-	// only from bn_mul_mont which has already signed the return address.
-	cmp	x1,x2
-	b.ne	__bn_mul4x_mont
-Lsqr8x_mont:
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	stp	x0,x3,[sp,#96]	// offload rp and np
-
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	ldp	x10,x11,[x1,#8*4]
-	ldp	x12,x13,[x1,#8*6]
-
-	sub	x2,sp,x5,lsl#4
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	mov	sp,x2			// alloca
-	sub	x27,x5,#8*8
-	b	Lsqr8x_zero_start
-
-Lsqr8x_zero:
-	sub	x27,x27,#8*8
-	stp	xzr,xzr,[x2,#8*0]
-	stp	xzr,xzr,[x2,#8*2]
-	stp	xzr,xzr,[x2,#8*4]
-	stp	xzr,xzr,[x2,#8*6]
-Lsqr8x_zero_start:
-	stp	xzr,xzr,[x2,#8*8]
-	stp	xzr,xzr,[x2,#8*10]
-	stp	xzr,xzr,[x2,#8*12]
-	stp	xzr,xzr,[x2,#8*14]
-	add	x2,x2,#8*16
-	cbnz	x27,Lsqr8x_zero
-
-	add	x3,x1,x5
-	add	x1,x1,#8*8
-	mov	x19,xzr
-	mov	x20,xzr
-	mov	x21,xzr
-	mov	x22,xzr
-	mov	x23,xzr
-	mov	x24,xzr
-	mov	x25,xzr
-	mov	x26,xzr
-	mov	x2,sp
-	str	x4,[x29,#112]		// offload n0
-
-	// Multiply everything but a[i]*a[i]
-.align	4
-Lsqr8x_outer_loop:
-        //                                                 a[1]a[0]	(i)
-        //                                             a[2]a[0]
-        //                                         a[3]a[0]
-        //                                     a[4]a[0]
-        //                                 a[5]a[0]
-        //                             a[6]a[0]
-        //                         a[7]a[0]
-        //                                         a[2]a[1]		(ii)
-        //                                     a[3]a[1]
-        //                                 a[4]a[1]
-        //                             a[5]a[1]
-        //                         a[6]a[1]
-        //                     a[7]a[1]
-        //                                 a[3]a[2]			(iii)
-        //                             a[4]a[2]
-        //                         a[5]a[2]
-        //                     a[6]a[2]
-        //                 a[7]a[2]
-        //                         a[4]a[3]				(iv)
-        //                     a[5]a[3]
-        //                 a[6]a[3]
-        //             a[7]a[3]
-        //                 a[5]a[4]					(v)
-        //             a[6]a[4]
-        //         a[7]a[4]
-        //         a[6]a[5]						(vi)
-        //     a[7]a[5]
-        // a[7]a[6]							(vii)
-
-	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
-	mul	x15,x8,x6
-	mul	x16,x9,x6
-	mul	x17,x10,x6
-	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
-	mul	x14,x11,x6
-	adcs	x21,x21,x15
-	mul	x15,x12,x6
-	adcs	x22,x22,x16
-	mul	x16,x13,x6
-	adcs	x23,x23,x17
-	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
-	adcs	x24,x24,x14
-	umulh	x14,x8,x6
-	adcs	x25,x25,x15
-	umulh	x15,x9,x6
-	adcs	x26,x26,x16
-	umulh	x16,x10,x6
-	stp	x19,x20,[x2],#8*2	// t[0..1]
-	adc	x19,xzr,xzr		// t[8]
-	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
-	umulh	x17,x11,x6
-	adcs	x22,x22,x14
-	umulh	x14,x12,x6
-	adcs	x23,x23,x15
-	umulh	x15,x13,x6
-	adcs	x24,x24,x16
-	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
-	adcs	x25,x25,x17
-	mul	x17,x9,x7
-	adcs	x26,x26,x14
-	mul	x14,x10,x7
-	adc	x19,x19,x15
-
-	mul	x15,x11,x7
-	adds	x22,x22,x16
-	mul	x16,x12,x7
-	adcs	x23,x23,x17
-	mul	x17,x13,x7
-	adcs	x24,x24,x14
-	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
-	adcs	x25,x25,x15
-	umulh	x15,x9,x7
-	adcs	x26,x26,x16
-	umulh	x16,x10,x7
-	adcs	x19,x19,x17
-	umulh	x17,x11,x7
-	stp	x21,x22,[x2],#8*2	// t[2..3]
-	adc	x20,xzr,xzr		// t[9]
-	adds	x23,x23,x14
-	umulh	x14,x12,x7
-	adcs	x24,x24,x15
-	umulh	x15,x13,x7
-	adcs	x25,x25,x16
-	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
-	adcs	x26,x26,x17
-	mul	x17,x10,x8
-	adcs	x19,x19,x14
-	mul	x14,x11,x8
-	adc	x20,x20,x15
-
-	mul	x15,x12,x8
-	adds	x24,x24,x16
-	mul	x16,x13,x8
-	adcs	x25,x25,x17
-	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
-	adcs	x26,x26,x14
-	umulh	x14,x10,x8
-	adcs	x19,x19,x15
-	umulh	x15,x11,x8
-	adcs	x20,x20,x16
-	umulh	x16,x12,x8
-	stp	x23,x24,[x2],#8*2	// t[4..5]
-	adc	x21,xzr,xzr		// t[10]
-	adds	x25,x25,x17
-	umulh	x17,x13,x8
-	adcs	x26,x26,x14
-	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
-	adcs	x19,x19,x15
-	mul	x15,x11,x9
-	adcs	x20,x20,x16
-	mul	x16,x12,x9
-	adc	x21,x21,x17
-
-	mul	x17,x13,x9
-	adds	x26,x26,x14
-	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
-	adcs	x19,x19,x15
-	umulh	x15,x11,x9
-	adcs	x20,x20,x16
-	umulh	x16,x12,x9
-	adcs	x21,x21,x17
-	umulh	x17,x13,x9
-	stp	x25,x26,[x2],#8*2	// t[6..7]
-	adc	x22,xzr,xzr		// t[11]
-	adds	x19,x19,x14
-	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
-	adcs	x20,x20,x15
-	mul	x15,x12,x10
-	adcs	x21,x21,x16
-	mul	x16,x13,x10
-	adc	x22,x22,x17
-
-	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
-	adds	x20,x20,x14
-	umulh	x14,x12,x10
-	adcs	x21,x21,x15
-	umulh	x15,x13,x10
-	adcs	x22,x22,x16
-	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
-	adc	x23,xzr,xzr		// t[12]
-	adds	x21,x21,x17
-	mul	x17,x13,x11
-	adcs	x22,x22,x14
-	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
-	adc	x23,x23,x15
-
-	umulh	x15,x13,x11
-	adds	x22,x22,x16
-	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
-	adcs	x23,x23,x17
-	umulh	x17,x13,x12		// hi(a[7]*a[6])
-	adc	x24,xzr,xzr		// t[13]
-	adds	x23,x23,x14
-	sub	x27,x3,x1	// done yet?
-	adc	x24,x24,x15
-
-	adds	x24,x24,x16
-	sub	x14,x3,x5	// rewinded ap
-	adc	x25,xzr,xzr		// t[14]
-	add	x25,x25,x17
-
-	cbz	x27,Lsqr8x_outer_break
-
-	mov	x4,x6
-	ldp	x6,x7,[x2,#8*0]
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	adds	x19,x19,x6
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x0,x1
-	adcs	x26,xzr,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved below
-	mov	x27,#-8*8
-
-	//                                                         a[8]a[0]
-	//                                                     a[9]a[0]
-	//                                                 a[a]a[0]
-	//                                             a[b]a[0]
-	//                                         a[c]a[0]
-	//                                     a[d]a[0]
-	//                                 a[e]a[0]
-	//                             a[f]a[0]
-	//                                                     a[8]a[1]
-	//                         a[f]a[1]........................
-	//                                                 a[8]a[2]
-	//                     a[f]a[2]........................
-	//                                             a[8]a[3]
-	//                 a[f]a[3]........................
-	//                                         a[8]a[4]
-	//             a[f]a[4]........................
-	//                                     a[8]a[5]
-	//         a[f]a[5]........................
-	//                                 a[8]a[6]
-	//     a[f]a[6]........................
-	//                             a[8]a[7]
-	// a[f]a[7]........................
-Lsqr8x_mul:
-	mul	x14,x6,x4
-	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
-	mul	x15,x7,x4
-	add	x27,x27,#8
-	mul	x16,x8,x4
-	mul	x17,x9,x4
-	adds	x19,x19,x14
-	mul	x14,x10,x4
-	adcs	x20,x20,x15
-	mul	x15,x11,x4
-	adcs	x21,x21,x16
-	mul	x16,x12,x4
-	adcs	x22,x22,x17
-	mul	x17,x13,x4
-	adcs	x23,x23,x14
-	umulh	x14,x6,x4
-	adcs	x24,x24,x15
-	umulh	x15,x7,x4
-	adcs	x25,x25,x16
-	umulh	x16,x8,x4
-	adcs	x26,x26,x17
-	umulh	x17,x9,x4
-	adc	x28,x28,xzr
-	str	x19,[x2],#8
-	adds	x19,x20,x14
-	umulh	x14,x10,x4
-	adcs	x20,x21,x15
-	umulh	x15,x11,x4
-	adcs	x21,x22,x16
-	umulh	x16,x12,x4
-	adcs	x22,x23,x17
-	umulh	x17,x13,x4
-	ldr	x4,[x0,x27]
-	adcs	x23,x24,x14
-	adcs	x24,x25,x15
-	adcs	x25,x26,x16
-	adcs	x26,x28,x17
-	//adc	x28,xzr,xzr		// moved above
-	cbnz	x27,Lsqr8x_mul
-					// note that carry flag is guaranteed
-					// to be zero at this point
-	cmp	x1,x3		// done yet?
-	b.eq	Lsqr8x_break
-
-	ldp	x6,x7,[x2,#8*0]
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	adds	x19,x19,x6
-	ldr	x4,[x0,#-8*8]
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x27,#-8*8
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved above
-	b	Lsqr8x_mul
-
-.align	4
-Lsqr8x_break:
-	ldp	x6,x7,[x0,#8*0]
-	add	x1,x0,#8*8
-	ldp	x8,x9,[x0,#8*2]
-	sub	x14,x3,x1		// is it last iteration?
-	ldp	x10,x11,[x0,#8*4]
-	sub	x15,x2,x14
-	ldp	x12,x13,[x0,#8*6]
-	cbz	x14,Lsqr8x_outer_loop
-
-	stp	x19,x20,[x2,#8*0]
-	ldp	x19,x20,[x15,#8*0]
-	stp	x21,x22,[x2,#8*2]
-	ldp	x21,x22,[x15,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[x15,#8*4]
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,x15
-	ldp	x25,x26,[x15,#8*6]
-	b	Lsqr8x_outer_loop
-
-.align	4
-Lsqr8x_outer_break:
-	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
-	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
-	ldp	x15,x16,[sp,#8*1]
-	ldp	x11,x13,[x14,#8*2]
-	add	x1,x14,#8*4
-	ldp	x17,x14,[sp,#8*3]
-
-	stp	x19,x20,[x2,#8*0]
-	mul	x19,x7,x7
-	stp	x21,x22,[x2,#8*2]
-	umulh	x7,x7,x7
-	stp	x23,x24,[x2,#8*4]
-	mul	x8,x9,x9
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,sp
-	umulh	x9,x9,x9
-	adds	x20,x7,x15,lsl#1
-	extr	x15,x16,x15,#63
-	sub	x27,x5,#8*4
-
-Lsqr4x_shift_n_add:
-	adcs	x21,x8,x15
-	extr	x16,x17,x16,#63
-	sub	x27,x27,#8*4
-	adcs	x22,x9,x16
-	ldp	x15,x16,[x2,#8*5]
-	mul	x10,x11,x11
-	ldp	x7,x9,[x1],#8*2
-	umulh	x11,x11,x11
-	mul	x12,x13,x13
-	umulh	x13,x13,x13
-	extr	x17,x14,x17,#63
-	stp	x19,x20,[x2,#8*0]
-	adcs	x23,x10,x17
-	extr	x14,x15,x14,#63
-	stp	x21,x22,[x2,#8*2]
-	adcs	x24,x11,x14
-	ldp	x17,x14,[x2,#8*7]
-	extr	x15,x16,x15,#63
-	adcs	x25,x12,x15
-	extr	x16,x17,x16,#63
-	adcs	x26,x13,x16
-	ldp	x15,x16,[x2,#8*9]
-	mul	x6,x7,x7
-	ldp	x11,x13,[x1],#8*2
-	umulh	x7,x7,x7
-	mul	x8,x9,x9
-	umulh	x9,x9,x9
-	stp	x23,x24,[x2,#8*4]
-	extr	x17,x14,x17,#63
-	stp	x25,x26,[x2,#8*6]
-	add	x2,x2,#8*8
-	adcs	x19,x6,x17
-	extr	x14,x15,x14,#63
-	adcs	x20,x7,x14
-	ldp	x17,x14,[x2,#8*3]
-	extr	x15,x16,x15,#63
-	cbnz	x27,Lsqr4x_shift_n_add
-	ldp	x1,x4,[x29,#104]	// pull np and n0
-
-	adcs	x21,x8,x15
-	extr	x16,x17,x16,#63
-	adcs	x22,x9,x16
-	ldp	x15,x16,[x2,#8*5]
-	mul	x10,x11,x11
-	umulh	x11,x11,x11
-	stp	x19,x20,[x2,#8*0]
-	mul	x12,x13,x13
-	umulh	x13,x13,x13
-	stp	x21,x22,[x2,#8*2]
-	extr	x17,x14,x17,#63
-	adcs	x23,x10,x17
-	extr	x14,x15,x14,#63
-	ldp	x19,x20,[sp,#8*0]
-	adcs	x24,x11,x14
-	extr	x15,x16,x15,#63
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x25,x12,x15
-	extr	x16,xzr,x16,#63
-	ldp	x8,x9,[x1,#8*2]
-	adc	x26,x13,x16
-	ldp	x10,x11,[x1,#8*4]
-
-	// Reduce by 512 bits per iteration
-	mul	x28,x4,x19		// t[0]*n0
-	ldp	x12,x13,[x1,#8*6]
-	add	x3,x1,x5
-	ldp	x21,x22,[sp,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[sp,#8*4]
-	stp	x25,x26,[x2,#8*6]
-	ldp	x25,x26,[sp,#8*6]
-	add	x1,x1,#8*8
-	mov	x30,xzr		// initial top-most carry
-	mov	x2,sp
-	mov	x27,#8
-
-Lsqr8x_reduction:
-	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
-	mul	x15,x7,x28
-	sub	x27,x27,#1
-	mul	x16,x8,x28
-	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
-	mul	x17,x9,x28
-	// (*)	adds	xzr,x19,x14
-	subs	xzr,x19,#1		// (*)
-	mul	x14,x10,x28
-	adcs	x19,x20,x15
-	mul	x15,x11,x28
-	adcs	x20,x21,x16
-	mul	x16,x12,x28
-	adcs	x21,x22,x17
-	mul	x17,x13,x28
-	adcs	x22,x23,x14
-	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
-	adcs	x23,x24,x15
-	umulh	x15,x7,x28
-	adcs	x24,x25,x16
-	umulh	x16,x8,x28
-	adcs	x25,x26,x17
-	umulh	x17,x9,x28
-	adc	x26,xzr,xzr
-	adds	x19,x19,x14
-	umulh	x14,x10,x28
-	adcs	x20,x20,x15
-	umulh	x15,x11,x28
-	adcs	x21,x21,x16
-	umulh	x16,x12,x28
-	adcs	x22,x22,x17
-	umulh	x17,x13,x28
-	mul	x28,x4,x19		// next t[0]*n0
-	adcs	x23,x23,x14
-	adcs	x24,x24,x15
-	adcs	x25,x25,x16
-	adc	x26,x26,x17
-	cbnz	x27,Lsqr8x_reduction
-
-	ldp	x14,x15,[x2,#8*0]
-	ldp	x16,x17,[x2,#8*2]
-	mov	x0,x2
-	sub	x27,x3,x1	// done yet?
-	adds	x19,x19,x14
-	adcs	x20,x20,x15
-	ldp	x14,x15,[x2,#8*4]
-	adcs	x21,x21,x16
-	adcs	x22,x22,x17
-	ldp	x16,x17,[x2,#8*6]
-	adcs	x23,x23,x14
-	adcs	x24,x24,x15
-	adcs	x25,x25,x16
-	adcs	x26,x26,x17
-	//adc	x28,xzr,xzr		// moved below
-	cbz	x27,Lsqr8x8_post_condition
-
-	ldr	x4,[x2,#-8*8]
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	ldp	x10,x11,[x1,#8*4]
-	mov	x27,#-8*8
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-
-Lsqr8x_tail:
-	mul	x14,x6,x4
-	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
-	mul	x15,x7,x4
-	add	x27,x27,#8
-	mul	x16,x8,x4
-	mul	x17,x9,x4
-	adds	x19,x19,x14
-	mul	x14,x10,x4
-	adcs	x20,x20,x15
-	mul	x15,x11,x4
-	adcs	x21,x21,x16
-	mul	x16,x12,x4
-	adcs	x22,x22,x17
-	mul	x17,x13,x4
-	adcs	x23,x23,x14
-	umulh	x14,x6,x4
-	adcs	x24,x24,x15
-	umulh	x15,x7,x4
-	adcs	x25,x25,x16
-	umulh	x16,x8,x4
-	adcs	x26,x26,x17
-	umulh	x17,x9,x4
-	adc	x28,x28,xzr
-	str	x19,[x2],#8
-	adds	x19,x20,x14
-	umulh	x14,x10,x4
-	adcs	x20,x21,x15
-	umulh	x15,x11,x4
-	adcs	x21,x22,x16
-	umulh	x16,x12,x4
-	adcs	x22,x23,x17
-	umulh	x17,x13,x4
-	ldr	x4,[x0,x27]
-	adcs	x23,x24,x14
-	adcs	x24,x25,x15
-	adcs	x25,x26,x16
-	adcs	x26,x28,x17
-	//adc	x28,xzr,xzr		// moved above
-	cbnz	x27,Lsqr8x_tail
-					// note that carry flag is guaranteed
-					// to be zero at this point
-	ldp	x6,x7,[x2,#8*0]
-	sub	x27,x3,x1	// done yet?
-	sub	x16,x3,x5	// rewinded np
-	ldp	x8,x9,[x2,#8*2]
-	ldp	x10,x11,[x2,#8*4]
-	ldp	x12,x13,[x2,#8*6]
-	cbz	x27,Lsqr8x_tail_break
-
-	ldr	x4,[x0,#-8*8]
-	adds	x19,x19,x6
-	adcs	x20,x20,x7
-	ldp	x6,x7,[x1,#8*0]
-	adcs	x21,x21,x8
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x1,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x1,#8*4]
-	adcs	x25,x25,x12
-	mov	x27,#-8*8
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	//adc	x28,xzr,xzr		// moved above
-	b	Lsqr8x_tail
-
-.align	4
-Lsqr8x_tail_break:
-	ldr	x4,[x29,#112]		// pull n0
-	add	x27,x2,#8*8		// end of current t[num] window
-
-	subs	xzr,x30,#1		// "move" top-most carry to carry bit
-	adcs	x14,x19,x6
-	adcs	x15,x20,x7
-	ldp	x19,x20,[x0,#8*0]
-	adcs	x21,x21,x8
-	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
-	adcs	x22,x22,x9
-	ldp	x8,x9,[x16,#8*2]
-	adcs	x23,x23,x10
-	adcs	x24,x24,x11
-	ldp	x10,x11,[x16,#8*4]
-	adcs	x25,x25,x12
-	adcs	x26,x26,x13
-	ldp	x12,x13,[x16,#8*6]
-	add	x1,x16,#8*8
-	adc	x30,xzr,xzr	// top-most carry
-	mul	x28,x4,x19
-	stp	x14,x15,[x2,#8*0]
-	stp	x21,x22,[x2,#8*2]
-	ldp	x21,x22,[x0,#8*2]
-	stp	x23,x24,[x2,#8*4]
-	ldp	x23,x24,[x0,#8*4]
-	cmp	x27,x29		// did we hit the bottom?
-	stp	x25,x26,[x2,#8*6]
-	mov	x2,x0			// slide the window
-	ldp	x25,x26,[x0,#8*6]
-	mov	x27,#8
-	b.ne	Lsqr8x_reduction
-
-	// Final step. We see if result is larger than modulus, and
-	// if it is, subtract the modulus. But comparison implies
-	// subtraction. So we subtract modulus, see if it borrowed,
-	// and conditionally copy original value.
-	ldr	x0,[x29,#96]		// pull rp
-	add	x2,x2,#8*8
-	subs	x14,x19,x6
-	sbcs	x15,x20,x7
-	sub	x27,x5,#8*8
-	mov	x3,x0		// x0 copy
-
-Lsqr8x_sub:
-	sbcs	x16,x21,x8
-	ldp	x6,x7,[x1,#8*0]
-	sbcs	x17,x22,x9
-	stp	x14,x15,[x0,#8*0]
-	sbcs	x14,x23,x10
-	ldp	x8,x9,[x1,#8*2]
-	sbcs	x15,x24,x11
-	stp	x16,x17,[x0,#8*2]
-	sbcs	x16,x25,x12
-	ldp	x10,x11,[x1,#8*4]
-	sbcs	x17,x26,x13
-	ldp	x12,x13,[x1,#8*6]
-	add	x1,x1,#8*8
-	ldp	x19,x20,[x2,#8*0]
-	sub	x27,x27,#8*8
-	ldp	x21,x22,[x2,#8*2]
-	ldp	x23,x24,[x2,#8*4]
-	ldp	x25,x26,[x2,#8*6]
-	add	x2,x2,#8*8
-	stp	x14,x15,[x0,#8*4]
-	sbcs	x14,x19,x6
-	stp	x16,x17,[x0,#8*6]
-	add	x0,x0,#8*8
-	sbcs	x15,x20,x7
-	cbnz	x27,Lsqr8x_sub
-
-	sbcs	x16,x21,x8
-	mov	x2,sp
-	add	x1,sp,x5
-	ldp	x6,x7,[x3,#8*0]
-	sbcs	x17,x22,x9
-	stp	x14,x15,[x0,#8*0]
-	sbcs	x14,x23,x10
-	ldp	x8,x9,[x3,#8*2]
-	sbcs	x15,x24,x11
-	stp	x16,x17,[x0,#8*2]
-	sbcs	x16,x25,x12
-	ldp	x19,x20,[x1,#8*0]
-	sbcs	x17,x26,x13
-	ldp	x21,x22,[x1,#8*2]
-	sbcs	xzr,x30,xzr	// did it borrow?
-	ldr	x30,[x29,#8]		// pull return address
-	stp	x14,x15,[x0,#8*4]
-	stp	x16,x17,[x0,#8*6]
-
-	sub	x27,x5,#8*4
-Lsqr4x_cond_copy:
-	sub	x27,x27,#8*4
-	csel	x14,x19,x6,lo
-	stp	xzr,xzr,[x2,#8*0]
-	csel	x15,x20,x7,lo
-	ldp	x6,x7,[x3,#8*4]
-	ldp	x19,x20,[x1,#8*4]
-	csel	x16,x21,x8,lo
-	stp	xzr,xzr,[x2,#8*2]
-	add	x2,x2,#8*4
-	csel	x17,x22,x9,lo
-	ldp	x8,x9,[x3,#8*6]
-	ldp	x21,x22,[x1,#8*6]
-	add	x1,x1,#8*4
-	stp	x14,x15,[x3,#8*0]
-	stp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	stp	xzr,xzr,[x1,#8*0]
-	stp	xzr,xzr,[x1,#8*2]
-	cbnz	x27,Lsqr4x_cond_copy
-
-	csel	x14,x19,x6,lo
-	stp	xzr,xzr,[x2,#8*0]
-	csel	x15,x20,x7,lo
-	stp	xzr,xzr,[x2,#8*2]
-	csel	x16,x21,x8,lo
-	csel	x17,x22,x9,lo
-	stp	x14,x15,[x3,#8*0]
-	stp	x16,x17,[x3,#8*2]
-
-	b	Lsqr8x_done
-
-.align	4
-Lsqr8x8_post_condition:
-	adc	x28,xzr,xzr
-	ldr	x30,[x29,#8]		// pull return address
-	// x19-7,x28 hold result, x6-7 hold modulus
-	subs	x6,x19,x6
-	ldr	x1,[x29,#96]		// pull rp
-	sbcs	x7,x20,x7
-	stp	xzr,xzr,[sp,#8*0]
-	sbcs	x8,x21,x8
-	stp	xzr,xzr,[sp,#8*2]
-	sbcs	x9,x22,x9
-	stp	xzr,xzr,[sp,#8*4]
-	sbcs	x10,x23,x10
-	stp	xzr,xzr,[sp,#8*6]
-	sbcs	x11,x24,x11
-	stp	xzr,xzr,[sp,#8*8]
-	sbcs	x12,x25,x12
-	stp	xzr,xzr,[sp,#8*10]
-	sbcs	x13,x26,x13
-	stp	xzr,xzr,[sp,#8*12]
-	sbcs	x28,x28,xzr	// did it borrow?
-	stp	xzr,xzr,[sp,#8*14]
-
-	// x6-7 hold result-modulus
-	csel	x6,x19,x6,lo
-	csel	x7,x20,x7,lo
-	csel	x8,x21,x8,lo
-	csel	x9,x22,x9,lo
-	stp	x6,x7,[x1,#8*0]
-	csel	x10,x23,x10,lo
-	csel	x11,x24,x11,lo
-	stp	x8,x9,[x1,#8*2]
-	csel	x12,x25,x12,lo
-	csel	x13,x26,x13,lo
-	stp	x10,x11,[x1,#8*4]
-	stp	x12,x13,[x1,#8*6]
-
-Lsqr8x_done:
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldr	x29,[sp],#128
-	// x30 is popped earlier
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.align	5
-__bn_mul4x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
-	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
-	// return address.
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-	sub	x26,sp,x5,lsl#3
-	lsl	x5,x5,#3
-	ldr	x4,[x4]		// *n0
-	sub	sp,x26,#8*4		// alloca
-
-	add	x10,x2,x5
-	add	x27,x1,x5
-	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
-
-	ldr	x24,[x2,#8*0]		// b[0]
-	ldp	x6,x7,[x1,#8*0]	// a[0..3]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	mov	x19,xzr
-	mov	x20,xzr
-	mov	x21,xzr
-	mov	x22,xzr
-	ldp	x14,x15,[x3,#8*0]	// n[0..3]
-	ldp	x16,x17,[x3,#8*2]
-	adds	x3,x3,#8*4		// clear carry bit
-	mov	x0,xzr
-	mov	x28,#0
-	mov	x26,sp
-
-Loop_mul4x_1st_reduction:
-	mul	x10,x6,x24		// lo(a[0..3]*b[0])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
-	adcs	x20,x20,x11
-	mul	x25,x19,x4		// t[0]*n0
-	adcs	x21,x21,x12
-	umulh	x11,x7,x24
-	adcs	x22,x22,x13
-	umulh	x12,x8,x24
-	adc	x23,xzr,xzr
-	umulh	x13,x9,x24
-	ldr	x24,[x2,x28]		// next b[i] (or b[0])
-	adds	x20,x20,x10
-	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
-	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	// (*)	adds	xzr,x19,x10
-	subs	xzr,x19,#1		// (*)
-	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
-	adcs	x19,x20,x11
-	umulh	x11,x15,x25
-	adcs	x20,x21,x12
-	umulh	x12,x16,x25
-	adcs	x21,x22,x13
-	umulh	x13,x17,x25
-	adcs	x22,x23,x0
-	adc	x0,xzr,xzr
-	adds	x19,x19,x10
-	sub	x10,x27,x1
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_1st_reduction
-
-	cbz	x10,Lmul4x4_post_condition
-
-	ldp	x6,x7,[x1,#8*0]	// a[4..7]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	ldr	x25,[sp]		// a[0]*n0
-	ldp	x14,x15,[x3,#8*0]	// n[4..7]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-
-Loop_mul4x_1st_tail:
-	mul	x10,x6,x24		// lo(a[4..7]*b[i])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
-	adcs	x20,x20,x11
-	umulh	x11,x7,x24
-	adcs	x21,x21,x12
-	umulh	x12,x8,x24
-	adcs	x22,x22,x13
-	umulh	x13,x9,x24
-	adc	x23,xzr,xzr
-	ldr	x24,[x2,x28]		// next b[i] (or b[0])
-	adds	x20,x20,x10
-	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	adds	x19,x19,x10
-	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
-	adcs	x20,x20,x11
-	umulh	x11,x15,x25
-	adcs	x21,x21,x12
-	umulh	x12,x16,x25
-	adcs	x22,x22,x13
-	adcs	x23,x23,x0
-	umulh	x13,x17,x25
-	adc	x0,xzr,xzr
-	ldr	x25,[sp,x28]		// next t[0]*n0
-	str	x19,[x26],#8		// result!!!
-	adds	x19,x20,x10
-	sub	x10,x27,x1		// done yet?
-	adcs	x20,x21,x11
-	adcs	x21,x22,x12
-	adcs	x22,x23,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_1st_tail
-
-	sub	x11,x27,x5	// rewinded x1
-	cbz	x10,Lmul4x_proceed
-
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	ldp	x14,x15,[x3,#8*0]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	b	Loop_mul4x_1st_tail
-
-.align	5
-Lmul4x_proceed:
-	ldr	x24,[x2,#8*4]!		// *++b
-	adc	x30,x0,xzr
-	ldp	x6,x7,[x11,#8*0]	// a[0..3]
-	sub	x3,x3,x5		// rewind np
-	ldp	x8,x9,[x11,#8*2]
-	add	x1,x11,#8*4
-
-	stp	x19,x20,[x26,#8*0]	// result!!!
-	ldp	x19,x20,[sp,#8*4]	// t[0..3]
-	stp	x21,x22,[x26,#8*2]	// result!!!
-	ldp	x21,x22,[sp,#8*6]
-
-	ldp	x14,x15,[x3,#8*0]	// n[0..3]
-	mov	x26,sp
-	ldp	x16,x17,[x3,#8*2]
-	adds	x3,x3,#8*4		// clear carry bit
-	mov	x0,xzr
-
-.align	4
-Loop_mul4x_reduction:
-	mul	x10,x6,x24		// lo(a[0..3]*b[4])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
-	adcs	x20,x20,x11
-	mul	x25,x19,x4		// t[0]*n0
-	adcs	x21,x21,x12
-	umulh	x11,x7,x24
-	adcs	x22,x22,x13
-	umulh	x12,x8,x24
-	adc	x23,xzr,xzr
-	umulh	x13,x9,x24
-	ldr	x24,[x2,x28]		// next b[i]
-	adds	x20,x20,x10
-	// (*)	mul	x10,x14,x25
-	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
-	adcs	x21,x21,x11
-	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	// (*)	adds	xzr,x19,x10
-	subs	xzr,x19,#1		// (*)
-	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
-	adcs	x19,x20,x11
-	umulh	x11,x15,x25
-	adcs	x20,x21,x12
-	umulh	x12,x16,x25
-	adcs	x21,x22,x13
-	umulh	x13,x17,x25
-	adcs	x22,x23,x0
-	adc	x0,xzr,xzr
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_reduction
-
-	adc	x0,x0,xzr
-	ldp	x10,x11,[x26,#8*4]	// t[4..7]
-	ldp	x12,x13,[x26,#8*6]
-	ldp	x6,x7,[x1,#8*0]	// a[4..7]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-
-	ldr	x25,[sp]		// t[0]*n0
-	ldp	x14,x15,[x3,#8*0]	// n[4..7]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-
-.align	4
-Loop_mul4x_tail:
-	mul	x10,x6,x24		// lo(a[4..7]*b[4])
-	adc	x0,x0,xzr	// modulo-scheduled
-	mul	x11,x7,x24
-	add	x28,x28,#8
-	mul	x12,x8,x24
-	and	x28,x28,#31
-	mul	x13,x9,x24
-	adds	x19,x19,x10
-	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
-	adcs	x20,x20,x11
-	umulh	x11,x7,x24
-	adcs	x21,x21,x12
-	umulh	x12,x8,x24
-	adcs	x22,x22,x13
-	umulh	x13,x9,x24
-	adc	x23,xzr,xzr
-	ldr	x24,[x2,x28]		// next b[i]
-	adds	x20,x20,x10
-	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
-	adcs	x21,x21,x11
-	mul	x11,x15,x25
-	adcs	x22,x22,x12
-	mul	x12,x16,x25
-	adc	x23,x23,x13		// can't overflow
-	mul	x13,x17,x25
-	adds	x19,x19,x10
-	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
-	adcs	x20,x20,x11
-	umulh	x11,x15,x25
-	adcs	x21,x21,x12
-	umulh	x12,x16,x25
-	adcs	x22,x22,x13
-	umulh	x13,x17,x25
-	adcs	x23,x23,x0
-	ldr	x25,[sp,x28]		// next a[0]*n0
-	adc	x0,xzr,xzr
-	str	x19,[x26],#8		// result!!!
-	adds	x19,x20,x10
-	sub	x10,x27,x1		// done yet?
-	adcs	x20,x21,x11
-	adcs	x21,x22,x12
-	adcs	x22,x23,x13
-	//adc	x0,x0,xzr
-	cbnz	x28,Loop_mul4x_tail
-
-	sub	x11,x3,x5		// rewinded np?
-	adc	x0,x0,xzr
-	cbz	x10,Loop_mul4x_break
-
-	ldp	x10,x11,[x26,#8*4]
-	ldp	x12,x13,[x26,#8*6]
-	ldp	x6,x7,[x1,#8*0]
-	ldp	x8,x9,[x1,#8*2]
-	add	x1,x1,#8*4
-	adds	x19,x19,x10
-	adcs	x20,x20,x11
-	adcs	x21,x21,x12
-	adcs	x22,x22,x13
-	//adc	x0,x0,xzr
-	ldp	x14,x15,[x3,#8*0]
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	b	Loop_mul4x_tail
-
-.align	4
-Loop_mul4x_break:
-	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
-	adds	x19,x19,x30
-	add	x2,x2,#8*4		// bp++
-	adcs	x20,x20,xzr
-	sub	x1,x1,x5		// rewind ap
-	adcs	x21,x21,xzr
-	stp	x19,x20,[x26,#8*0]	// result!!!
-	adcs	x22,x22,xzr
-	ldp	x19,x20,[sp,#8*4]	// t[0..3]
-	adc	x30,x0,xzr
-	stp	x21,x22,[x26,#8*2]	// result!!!
-	cmp	x2,x13			// done yet?
-	ldp	x21,x22,[sp,#8*6]
-	ldp	x14,x15,[x11,#8*0]	// n[0..3]
-	ldp	x16,x17,[x11,#8*2]
-	add	x3,x11,#8*4
-	b.eq	Lmul4x_post
-
-	ldr	x24,[x2]
-	ldp	x6,x7,[x1,#8*0]	// a[0..3]
-	ldp	x8,x9,[x1,#8*2]
-	adds	x1,x1,#8*4		// clear carry bit
-	mov	x0,xzr
-	mov	x26,sp
-	b	Loop_mul4x_reduction
-
-.align	4
-Lmul4x_post:
-	// Final step. We see if result is larger than modulus, and
-	// if it is, subtract the modulus. But comparison implies
-	// subtraction. So we subtract modulus, see if it borrowed,
-	// and conditionally copy original value.
-	mov	x0,x12
-	mov	x27,x12		// x0 copy
-	subs	x10,x19,x14
-	add	x26,sp,#8*8
-	sbcs	x11,x20,x15
-	sub	x28,x5,#8*4
-
-Lmul4x_sub:
-	sbcs	x12,x21,x16
-	ldp	x14,x15,[x3,#8*0]
-	sub	x28,x28,#8*4
-	ldp	x19,x20,[x26,#8*0]
-	sbcs	x13,x22,x17
-	ldp	x16,x17,[x3,#8*2]
-	add	x3,x3,#8*4
-	ldp	x21,x22,[x26,#8*2]
-	add	x26,x26,#8*4
-	stp	x10,x11,[x0,#8*0]
-	sbcs	x10,x19,x14
-	stp	x12,x13,[x0,#8*2]
-	add	x0,x0,#8*4
-	sbcs	x11,x20,x15
-	cbnz	x28,Lmul4x_sub
-
-	sbcs	x12,x21,x16
-	mov	x26,sp
-	add	x1,sp,#8*4
-	ldp	x6,x7,[x27,#8*0]
-	sbcs	x13,x22,x17
-	stp	x10,x11,[x0,#8*0]
-	ldp	x8,x9,[x27,#8*2]
-	stp	x12,x13,[x0,#8*2]
-	ldp	x19,x20,[x1,#8*0]
-	ldp	x21,x22,[x1,#8*2]
-	sbcs	xzr,x30,xzr	// did it borrow?
-	ldr	x30,[x29,#8]		// pull return address
-
-	sub	x28,x5,#8*4
-Lmul4x_cond_copy:
-	sub	x28,x28,#8*4
-	csel	x10,x19,x6,lo
-	stp	xzr,xzr,[x26,#8*0]
-	csel	x11,x20,x7,lo
-	ldp	x6,x7,[x27,#8*4]
-	ldp	x19,x20,[x1,#8*4]
-	csel	x12,x21,x8,lo
-	stp	xzr,xzr,[x26,#8*2]
-	add	x26,x26,#8*4
-	csel	x13,x22,x9,lo
-	ldp	x8,x9,[x27,#8*6]
-	ldp	x21,x22,[x1,#8*6]
-	add	x1,x1,#8*4
-	stp	x10,x11,[x27,#8*0]
-	stp	x12,x13,[x27,#8*2]
-	add	x27,x27,#8*4
-	cbnz	x28,Lmul4x_cond_copy
-
-	csel	x10,x19,x6,lo
-	stp	xzr,xzr,[x26,#8*0]
-	csel	x11,x20,x7,lo
-	stp	xzr,xzr,[x26,#8*2]
-	csel	x12,x21,x8,lo
-	stp	xzr,xzr,[x26,#8*3]
-	csel	x13,x22,x9,lo
-	stp	xzr,xzr,[x26,#8*4]
-	stp	x10,x11,[x27,#8*0]
-	stp	x12,x13,[x27,#8*2]
-
-	b	Lmul4x_done
-
-.align	4
-Lmul4x4_post_condition:
-	adc	x0,x0,xzr
-	ldr	x1,[x29,#96]		// pull rp
-	// x19-3,x0 hold result, x14-7 hold modulus
-	subs	x6,x19,x14
-	ldr	x30,[x29,#8]		// pull return address
-	sbcs	x7,x20,x15
-	stp	xzr,xzr,[sp,#8*0]
-	sbcs	x8,x21,x16
-	stp	xzr,xzr,[sp,#8*2]
-	sbcs	x9,x22,x17
-	stp	xzr,xzr,[sp,#8*4]
-	sbcs	xzr,x0,xzr		// did it borrow?
-	stp	xzr,xzr,[sp,#8*6]
-
-	// x6-3 hold result-modulus
-	csel	x6,x19,x6,lo
-	csel	x7,x20,x7,lo
-	csel	x8,x21,x8,lo
-	csel	x9,x22,x9,lo
-	stp	x6,x7,[x1,#8*0]
-	stp	x8,x9,[x1,#8*2]
-
-Lmul4x_done:
-	ldp	x19,x20,[x29,#16]
-	mov	sp,x29
-	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldr	x29,[sp],#128
-	// x30 is popped earlier
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	4
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S
deleted file mode 100644
index 5e3471a..0000000
--- a/apple-aarch64/crypto/fipsmodule/bn-armv8-apple.S
+++ /dev/null
@@ -1,89 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-//                       size_t num);
-
-.globl	_bn_add_words
-.private_extern	_bn_add_words
-.align	4
-_bn_add_words:
-	AARCH64_VALID_CALL_TARGET
-	# Clear the carry flag.
-	cmn	xzr, xzr
-
-	# aarch64 can load two registers at a time, so we do two loop iterations
-	# at a time. Split x3 = 2 * x8 + x3. This allows loop
-	# operations to use CBNZ without clobbering the carry flag.
-	lsr	x8, x3, #1
-	and	x3, x3, #1
-
-	cbz	x8, Ladd_tail
-Ladd_loop:
-	ldp	x4, x5, [x1], #16
-	ldp	x6, x7, [x2], #16
-	sub	x8, x8, #1
-	adcs	x4, x4, x6
-	adcs	x5, x5, x7
-	stp	x4, x5, [x0], #16
-	cbnz	x8, Ladd_loop
-
-Ladd_tail:
-	cbz	x3, Ladd_exit
-	ldr	x4, [x1], #8
-	ldr	x6, [x2], #8
-	adcs	x4, x4, x6
-	str	x4, [x0], #8
-
-Ladd_exit:
-	cset	x0, cs
-	ret
-
-
-// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-//                       size_t num);
-
-.globl	_bn_sub_words
-.private_extern	_bn_sub_words
-.align	4
-_bn_sub_words:
-	AARCH64_VALID_CALL_TARGET
-	# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
-	# so we want C = 1 here.
-	cmp	xzr, xzr
-
-	# aarch64 can load two registers at a time, so we do two loop iterations
-	# at a time. Split x3 = 2 * x8 + x3. This allows loop
-	# operations to use CBNZ without clobbering the carry flag.
-	lsr	x8, x3, #1
-	and	x3, x3, #1
-
-	cbz	x8, Lsub_tail
-Lsub_loop:
-	ldp	x4, x5, [x1], #16
-	ldp	x6, x7, [x2], #16
-	sub	x8, x8, #1
-	sbcs	x4, x4, x6
-	sbcs	x5, x5, x7
-	stp	x4, x5, [x0], #16
-	cbnz	x8, Lsub_loop
-
-Lsub_tail:
-	cbz	x3, Lsub_exit
-	ldr	x4, [x1], #8
-	ldr	x6, [x2], #8
-	sbcs	x4, x4, x6
-	str	x4, [x0], #8
-
-Lsub_exit:
-	cset	x0, cc
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S
deleted file mode 100644
index a76b8d1..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8-apple.S
+++ /dev/null
@@ -1,335 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-.globl	_gcm_init_neon
-.private_extern	_gcm_init_neon
-
-.align	4
-_gcm_init_neon:
-	AARCH64_VALID_CALL_TARGET
-	// This function is adapted from gcm_init_v8. xC2 is t3.
-	ld1	{v17.2d}, [x1]			// load H
-	movi	v19.16b, #0xe1
-	shl	v19.2d, v19.2d, #57		// 0xc2.0
-	ext	v3.16b, v17.16b, v17.16b, #8
-	ushr	v18.2d, v19.2d, #63
-	dup	v17.4s, v17.s[1]
-	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
-	ushr	v18.2d, v3.2d, #63
-	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
-	and	v18.16b, v18.16b, v16.16b
-	shl	v3.2d, v3.2d, #1
-	ext	v18.16b, v18.16b, v18.16b, #8
-	and	v16.16b, v16.16b, v17.16b
-	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
-	eor	v5.16b, v3.16b, v16.16b	// twisted H
-	st1	{v5.2d}, [x0]			// store Htable[0]
-	ret
-
-
-.globl	_gcm_gmult_neon
-.private_extern	_gcm_gmult_neon
-
-.align	4
-_gcm_gmult_neon:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v3.16b}, [x0]		// load Xi
-	ld1	{v5.1d}, [x1], #8		// load twisted H
-	ld1	{v6.1d}, [x1]
-	adrp	x9, Lmasks@PAGE		// load constants
-	add	x9, x9, Lmasks@PAGEOFF
-	ld1	{v24.2d, v25.2d}, [x9]
-	rev64	v3.16b, v3.16b		// byteswap Xi
-	ext	v3.16b, v3.16b, v3.16b, #8
-	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
-
-	mov	x3, #16
-	b	Lgmult_neon
-
-
-.globl	_gcm_ghash_neon
-.private_extern	_gcm_ghash_neon
-
-.align	4
-_gcm_ghash_neon:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v0.16b}, [x0]		// load Xi
-	ld1	{v5.1d}, [x1], #8		// load twisted H
-	ld1	{v6.1d}, [x1]
-	adrp	x9, Lmasks@PAGE		// load constants
-	add	x9, x9, Lmasks@PAGEOFF
-	ld1	{v24.2d, v25.2d}, [x9]
-	rev64	v0.16b, v0.16b		// byteswap Xi
-	ext	v0.16b, v0.16b, v0.16b, #8
-	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
-
-Loop_neon:
-	ld1	{v3.16b}, [x2], #16	// load inp
-	rev64	v3.16b, v3.16b		// byteswap inp
-	ext	v3.16b, v3.16b, v3.16b, #8
-	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
-
-Lgmult_neon:
-	// Split the input into v3 and v4. (The upper halves are unused,
-	// so it is okay to leave them alone.)
-	ins	v4.d[0], v3.d[1]
-	ext	v16.8b, v5.8b, v5.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
-	ext	v0.8b, v3.8b, v3.8b, #1		// B1
-	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
-	ext	v17.8b, v5.8b, v5.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
-	ext	v19.8b, v3.8b, v3.8b, #2	// B2
-	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v5.8b, v5.8b, #3	// A3
-	eor	v16.16b, v16.16b, v0.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
-	ext	v0.8b, v3.8b, v3.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v3.8b, v3.8b, #4	// B4
-	eor	v18.16b, v18.16b, v0.16b	// N = I + J
-	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v0.16b, v0.16b, v16.16b
-	eor	v0.16b, v0.16b, v18.16b
-	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
-	ext	v16.8b, v7.8b, v7.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
-	ext	v1.8b, v3.8b, v3.8b, #1		// B1
-	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
-	ext	v17.8b, v7.8b, v7.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
-	ext	v19.8b, v3.8b, v3.8b, #2	// B2
-	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v7.8b, v7.8b, #3	// A3
-	eor	v16.16b, v16.16b, v1.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
-	ext	v1.8b, v3.8b, v3.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v3.8b, v3.8b, #4	// B4
-	eor	v18.16b, v18.16b, v1.16b	// N = I + J
-	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v1.16b, v1.16b, v16.16b
-	eor	v1.16b, v1.16b, v18.16b
-	ext	v16.8b, v6.8b, v6.8b, #1	// A1
-	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
-	ext	v2.8b, v4.8b, v4.8b, #1		// B1
-	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
-	ext	v17.8b, v6.8b, v6.8b, #2	// A2
-	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
-	ext	v19.8b, v4.8b, v4.8b, #2	// B2
-	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
-	ext	v18.8b, v6.8b, v6.8b, #3	// A3
-	eor	v16.16b, v16.16b, v2.16b	// L = E + F
-	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
-	ext	v2.8b, v4.8b, v4.8b, #3		// B3
-	eor	v17.16b, v17.16b, v19.16b	// M = G + H
-	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
-
-	// Here we diverge from the 32-bit version. It computes the following
-	// (instructions reordered for clarity):
-	//
-	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
-	//     vand	$t0#hi, $t0#hi, $k48
-	//     veor	$t0#lo, $t0#lo, $t0#hi
-	//
-	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
-	//     vand	$t1#hi, $t1#hi, $k32
-	//     veor	$t1#lo, $t1#lo, $t1#hi
-	//
-	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
-	//     vand	$t2#hi, $t2#hi, $k16
-	//     veor	$t2#lo, $t2#lo, $t2#hi
-	//
-	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
-	//     vmov.i64	$t3#hi, #0
-	//
-	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
-	// upper halves of SIMD registers, so we must split each half into
-	// separate registers. To compensate, we pair computations up and
-	// parallelize.
-
-	ext	v19.8b, v4.8b, v4.8b, #4	// B4
-	eor	v18.16b, v18.16b, v2.16b	// N = I + J
-	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
-
-	// This can probably be scheduled more efficiently. For now, we just
-	// pair up independent instructions.
-	zip1	v20.2d, v16.2d, v17.2d
-	zip1	v22.2d, v18.2d, v19.2d
-	zip2	v21.2d, v16.2d, v17.2d
-	zip2	v23.2d, v18.2d, v19.2d
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	and	v21.16b, v21.16b, v24.16b
-	and	v23.16b, v23.16b, v25.16b
-	eor	v20.16b, v20.16b, v21.16b
-	eor	v22.16b, v22.16b, v23.16b
-	zip1	v16.2d, v20.2d, v21.2d
-	zip1	v18.2d, v22.2d, v23.2d
-	zip2	v17.2d, v20.2d, v21.2d
-	zip2	v19.2d, v22.2d, v23.2d
-
-	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
-	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
-	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
-	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
-	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
-	eor	v16.16b, v16.16b, v17.16b
-	eor	v18.16b, v18.16b, v19.16b
-	eor	v2.16b, v2.16b, v16.16b
-	eor	v2.16b, v2.16b, v18.16b
-	ext	v16.16b, v0.16b, v2.16b, #8
-	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
-	eor	v1.16b, v1.16b, v2.16b
-	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
-	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
-	// This is a no-op due to the ins instruction below.
-	// ins	v2.d[0], v1.d[1]
-
-	// equivalent of reduction_avx from ghash-x86_64.pl
-	shl	v17.2d, v0.2d, #57		// 1st phase
-	shl	v18.2d, v0.2d, #62
-	eor	v18.16b, v18.16b, v17.16b	//
-	shl	v17.2d, v0.2d, #63
-	eor	v18.16b, v18.16b, v17.16b	//
-	// Note Xm contains {Xl.d[1], Xh.d[0]}.
-	eor	v18.16b, v18.16b, v1.16b
-	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
-	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
-
-	ushr	v18.2d, v0.2d, #1		// 2nd phase
-	eor	v2.16b, v2.16b,v0.16b
-	eor	v0.16b, v0.16b,v18.16b	//
-	ushr	v18.2d, v18.2d, #6
-	ushr	v0.2d, v0.2d, #1		//
-	eor	v0.16b, v0.16b, v2.16b	//
-	eor	v0.16b, v0.16b, v18.16b	//
-
-	subs	x3, x3, #16
-	bne	Loop_neon
-
-	rev64	v0.16b, v0.16b		// byteswap Xi and write
-	ext	v0.16b, v0.16b, v0.16b, #8
-	st1	{v0.16b}, [x0]
-
-	ret
-
-
-.section	__TEXT,__const
-.align	4
-Lmasks:
-.quad	0x0000ffffffffffff	// k48
-.quad	0x00000000ffffffff	// k32
-.quad	0x000000000000ffff	// k16
-.quad	0x0000000000000000	// k0
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S
deleted file mode 100644
index 6bc8a4f..0000000
--- a/apple-aarch64/crypto/fipsmodule/ghashv8-armv8-apple.S
+++ /dev/null
@@ -1,565 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.globl	_gcm_init_v8
-.private_extern	_gcm_init_v8
-
-.align	4
-_gcm_init_v8:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v17.2d},[x1]		//load input H
-	movi	v19.16b,#0xe1
-	shl	v19.2d,v19.2d,#57		//0xc2.0
-	ext	v3.16b,v17.16b,v17.16b,#8
-	ushr	v18.2d,v19.2d,#63
-	dup	v17.4s,v17.s[1]
-	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
-	ushr	v18.2d,v3.2d,#63
-	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
-	and	v18.16b,v18.16b,v16.16b
-	shl	v3.2d,v3.2d,#1
-	ext	v18.16b,v18.16b,v18.16b,#8
-	and	v16.16b,v16.16b,v17.16b
-	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
-	eor	v20.16b,v3.16b,v16.16b		//twisted H
-	st1	{v20.2d},[x0],#16		//store Htable[0]
-
-	//calculate H^2
-	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
-	pmull	v0.1q,v20.1d,v20.1d
-	eor	v16.16b,v16.16b,v20.16b
-	pmull2	v2.1q,v20.2d,v20.2d
-	pmull	v1.1q,v16.1d,v16.1d
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v22.16b,v0.16b,v18.16b
-
-	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
-	eor	v17.16b,v17.16b,v22.16b
-	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
-	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
-	//calculate H^3 and H^4
-	pmull	v0.1q,v20.1d, v22.1d
-	pmull	v5.1q,v22.1d,v22.1d
-	pmull2	v2.1q,v20.2d, v22.2d
-	pmull2	v7.1q,v22.2d,v22.2d
-	pmull	v1.1q,v16.1d,v17.1d
-	pmull	v6.1q,v17.1d,v17.1d
-
-	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	ext	v17.16b,v5.16b,v7.16b,#8
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v16.16b
-	eor	v4.16b,v5.16b,v7.16b
-	eor	v6.16b,v6.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase
-	eor	v6.16b,v6.16b,v4.16b
-	pmull	v4.1q,v5.1d,v19.1d
-
-	ins	v2.d[0],v1.d[1]
-	ins	v7.d[0],v6.d[1]
-	ins	v1.d[1],v0.d[0]
-	ins	v6.d[1],v5.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-	eor	v5.16b,v6.16b,v4.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
-	ext	v4.16b,v5.16b,v5.16b,#8
-	pmull	v0.1q,v0.1d,v19.1d
-	pmull	v5.1q,v5.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v4.16b,v4.16b,v7.16b
-	eor	v20.16b, v0.16b,v18.16b		//H^3
-	eor	v22.16b,v5.16b,v4.16b		//H^4
-
-	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
-	ext	v17.16b,v22.16b,v22.16b,#8
-	eor	v16.16b,v16.16b,v20.16b
-	eor	v17.16b,v17.16b,v22.16b
-	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
-	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
-	ret
-
-.globl	_gcm_gmult_v8
-.private_extern	_gcm_gmult_v8
-
-.align	4
-_gcm_gmult_v8:
-	AARCH64_VALID_CALL_TARGET
-	ld1	{v17.2d},[x0]		//load Xi
-	movi	v19.16b,#0xe1
-	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
-	shl	v19.2d,v19.2d,#57
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ext	v3.16b,v17.16b,v17.16b,#8
-
-	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
-	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
-	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v0.16b,v0.16b,v0.16b,#8
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
-
-.globl	_gcm_ghash_v8
-.private_extern	_gcm_ghash_v8
-
-.align	4
-_gcm_ghash_v8:
-	AARCH64_VALID_CALL_TARGET
-	cmp	x3,#64
-	b.hs	Lgcm_ghash_v8_4x
-	ld1	{v0.2d},[x0]		//load [rotated] Xi
-						//"[rotated]" means that
-						//loaded value would have
-						//to be rotated in order to
-						//make it appear as in
-						//algorithm specification
-	subs	x3,x3,#32		//see if x3 is 32 or larger
-	mov	x12,#16		//x12 is used as post-
-						//increment for input pointer;
-						//as loop is modulo-scheduled
-						//x12 is zeroed just in time
-						//to preclude overstepping
-						//inp[len], which means that
-						//last block[s] are actually
-						//loaded twice, but last
-						//copy is not processed
-	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
-	movi	v19.16b,#0xe1
-	ld1	{v22.2d},[x1]
-	csel	x12,xzr,x12,eq			//is it time to zero x12?
-	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
-	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
-	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
-#ifndef __AARCH64EB__
-	rev64	v16.16b,v16.16b
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
-	b.lo	Lodd_tail_v8		//x3 was less than 32
-	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ext	v7.16b,v17.16b,v17.16b,#8
-	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
-	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
-	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
-	pmull2	v6.1q,v20.2d,v7.2d
-	b	Loop_mod2x_v8
-
-.align	4
-Loop_mod2x_v8:
-	ext	v18.16b,v3.16b,v3.16b,#8
-	subs	x3,x3,#32		//is there more data?
-	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
-	csel	x12,xzr,x12,lo			//is it time to zero x12?
-
-	pmull	v5.1q,v21.1d,v17.1d
-	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
-	eor	v0.16b,v0.16b,v4.16b		//accumulate
-	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
-	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
-
-	eor	v2.16b,v2.16b,v6.16b
-	csel	x12,xzr,x12,eq			//is it time to zero x12?
-	eor	v1.16b,v1.16b,v5.16b
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
-#ifndef __AARCH64EB__
-	rev64	v16.16b,v16.16b
-#endif
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-#ifndef __AARCH64EB__
-	rev64	v17.16b,v17.16b
-#endif
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v7.16b,v17.16b,v17.16b,#8
-	ext	v3.16b,v16.16b,v16.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
-	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v3.16b,v3.16b,v18.16b
-	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
-	eor	v3.16b,v3.16b,v0.16b
-	pmull2	v6.1q,v20.2d,v7.2d
-	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
-
-	eor	v2.16b,v2.16b,v18.16b
-	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
-	adds	x3,x3,#32		//re-construct x3
-	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
-	b.eq	Ldone_v8		//is x3 zero?
-Lodd_tail_v8:
-	ext	v18.16b,v0.16b,v0.16b,#8
-	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
-	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
-
-	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
-	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
-	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
-	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-
-Ldone_v8:
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	ext	v0.16b,v0.16b,v0.16b,#8
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
-
-
-.align	4
-gcm_ghash_v8_4x:
-Lgcm_ghash_v8_4x:
-	ld1	{v0.2d},[x0]		//load [rotated] Xi
-	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
-	movi	v19.16b,#0xe1
-	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
-	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
-
-	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v7.16b,v7.16b
-	rev64	v4.16b,v4.16b
-#endif
-	ext	v25.16b,v7.16b,v7.16b,#8
-	ext	v24.16b,v6.16b,v6.16b,#8
-	ext	v23.16b,v5.16b,v5.16b,#8
-
-	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
-	eor	v7.16b,v7.16b,v25.16b
-	pmull2	v31.1q,v20.2d,v25.2d
-	pmull	v30.1q,v21.1d,v7.1d
-
-	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-	pmull2	v24.1q,v22.2d,v24.2d
-	pmull2	v6.1q,v21.2d,v6.2d
-
-	eor	v29.16b,v29.16b,v16.16b
-	eor	v31.16b,v31.16b,v24.16b
-	eor	v30.16b,v30.16b,v6.16b
-
-	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	pmull2	v23.1q,v26.2d,v23.2d
-	pmull	v5.1q,v27.1d,v5.1d
-
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	eor	v30.16b,v30.16b,v5.16b
-
-	subs	x3,x3,#128
-	b.lo	Ltail4x
-
-	b	Loop4x
-
-.align	4
-Loop4x:
-	eor	v16.16b,v4.16b,v0.16b
-	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
-	ext	v3.16b,v16.16b,v16.16b,#8
-#ifndef __AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v7.16b,v7.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v28.2d,v3.2d
-	ext	v25.16b,v7.16b,v7.16b,#8
-	pmull2	v1.1q,v27.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	ext	v24.16b,v6.16b,v6.16b,#8
-	eor	v1.16b,v1.16b,v30.16b
-	ext	v23.16b,v5.16b,v5.16b,#8
-
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
-	eor	v7.16b,v7.16b,v25.16b
-	eor	v1.16b,v1.16b,v17.16b
-	pmull2	v31.1q,v20.2d,v25.2d
-	eor	v1.16b,v1.16b,v18.16b
-	pmull	v30.1q,v21.1d,v7.1d
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-	pmull2	v24.1q,v22.2d,v24.2d
-	eor	v0.16b,v1.16b,v18.16b
-	pmull2	v6.1q,v21.2d,v6.2d
-
-	eor	v29.16b,v29.16b,v16.16b
-	eor	v31.16b,v31.16b,v24.16b
-	eor	v30.16b,v30.16b,v6.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	eor	v18.16b,v18.16b,v2.16b
-	pmull2	v23.1q,v26.2d,v23.2d
-	pmull	v5.1q,v27.1d,v5.1d
-
-	eor	v0.16b,v0.16b,v18.16b
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-	eor	v30.16b,v30.16b,v5.16b
-
-	subs	x3,x3,#64
-	b.hs	Loop4x
-
-Ltail4x:
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v28.2d,v3.2d
-	pmull2	v1.1q,v27.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-
-	adds	x3,x3,#64
-	b.eq	Ldone4x
-
-	cmp	x3,#32
-	b.lo	Lone
-	b.eq	Ltwo
-Lthree:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d,v5.2d,v6.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v6.16b,v6.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v24.16b,v6.16b,v6.16b,#8
-	ext	v23.16b,v5.16b,v5.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-
-	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
-	eor	v6.16b,v6.16b,v24.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	pmull2	v31.1q,v20.2d,v24.2d
-	pmull	v30.1q,v21.1d,v6.1d
-	eor	v0.16b,v0.16b,v18.16b
-	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	pmull2	v23.1q,v22.2d,v23.2d
-	eor	v16.16b,v4.16b,v0.16b
-	pmull2	v5.1q,v21.2d,v5.2d
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	eor	v29.16b,v29.16b,v7.16b
-	eor	v31.16b,v31.16b,v23.16b
-	eor	v30.16b,v30.16b,v5.16b
-
-	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v26.2d,v3.2d
-	pmull	v1.1q,v27.1d,v16.1d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-	b	Ldone4x
-
-.align	4
-Ltwo:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d,v5.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v5.16b,v5.16b
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	ext	v23.16b,v5.16b,v5.16b,#8
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
-	eor	v5.16b,v5.16b,v23.16b
-
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull2	v31.1q,v20.2d,v23.2d
-	pmull	v30.1q,v21.1d,v5.1d
-
-	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v22.2d,v3.2d
-	pmull2	v1.1q,v21.2d,v16.2d
-
-	eor	v0.16b,v0.16b,v29.16b
-	eor	v2.16b,v2.16b,v31.16b
-	eor	v1.16b,v1.16b,v30.16b
-	b	Ldone4x
-
-.align	4
-Lone:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	ld1	{v4.2d},[x2]
-	eor	v1.16b,v1.16b,v18.16b
-#ifndef	__AARCH64EB__
-	rev64	v4.16b,v4.16b
-#endif
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-	eor	v16.16b,v4.16b,v0.16b
-	ext	v3.16b,v16.16b,v16.16b,#8
-
-	pmull	v0.1q,v20.1d,v3.1d
-	eor	v16.16b,v16.16b,v3.16b
-	pmull2	v2.1q,v20.2d,v3.2d
-	pmull	v1.1q,v21.1d,v16.1d
-
-Ldone4x:
-	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
-	eor	v18.16b,v0.16b,v2.16b
-	eor	v1.16b,v1.16b,v17.16b
-	eor	v1.16b,v1.16b,v18.16b
-
-	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
-	ins	v2.d[0],v1.d[1]
-	ins	v1.d[1],v0.d[0]
-	eor	v0.16b,v1.16b,v18.16b
-
-	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
-	pmull	v0.1q,v0.1d,v19.1d
-	eor	v18.16b,v18.16b,v2.16b
-	eor	v0.16b,v0.16b,v18.16b
-	ext	v0.16b,v0.16b,v0.16b,#8
-
-#ifndef __AARCH64EB__
-	rev64	v0.16b,v0.16b
-#endif
-	st1	{v0.2d},[x0]		//write out Xi
-
-	ret
-
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
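
For reference, the routine above is the four-block PMULL path of GHASH: each iteration folds four input blocks against the precomputed powers H..H^4 (Karatsuba per block, one two-phase reduction), with one-, two- and three-block tails before Xi is written out. A minimal Python sketch of the arithmetic it implements, using the plain NIST SP 800-38D formulation rather than the NEON register layout (helper names are illustrative, not BoringSSL API):

    # GF(2^128) reduction constant used by GCM: x^128 + x^7 + x^2 + x + 1.
    GCM_R = 0xE1 << 120

    def gf128_mul(x, y):
        # Bit-serial multiply of two 128-bit field elements (MSB of x first),
        # following Algorithm 1 of NIST SP 800-38D.
        z, v = 0, y
        for i in range(128):
            if (x >> (127 - i)) & 1:
                z ^= v
            v = (v >> 1) ^ GCM_R if v & 1 else v >> 1
        return z

    def ghash(h, blocks):
        # Xi = (Xi-1 XOR block) * H, chained over 128-bit integer blocks; the
        # assembly computes the same Xi four blocks at a time.
        xi = 0
        for block in blocks:
            xi = gf128_mul(xi ^ block, h)
        return xi
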
diff --git a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S b/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S
deleted file mode 100644
index c8469e6..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256-armv8-asm-apple.S
+++ /dev/null
@@ -1,1726 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include "openssl/arm_arch.h"
-
-.section	__TEXT,__const
-.align	5
-Lpoly:
-.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
-LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
-.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
-Lone_mont:
-.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
-Lone:
-.quad	1,0,0,0
-Lord:
-.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
-LordK:
-.quad	0xccd1c8aaee00bc4f
-.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.text
-
-// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
-//					     const BN_ULONG x2[4]);
-.globl	_ecp_nistz256_mul_mont
-.private_extern	_ecp_nistz256_mul_mont
-
-.align	4
-_ecp_nistz256_mul_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	ldr	x3,[x2]		// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	bl	__ecp_nistz256_mul_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_sqr_mont
-.private_extern	_ecp_nistz256_sqr_mont
-
-.align	4
-_ecp_nistz256_sqr_mont:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-32]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	bl	__ecp_nistz256_sqr_mont
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x29,x30,[sp],#32
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_div_by_2
-.private_extern	_ecp_nistz256_div_by_2
-
-.align	4
-_ecp_nistz256_div_by_2:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	bl	__ecp_nistz256_div_by_2
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_mul_by_2
-.private_extern	_ecp_nistz256_mul_by_2
-
-.align	4
-_ecp_nistz256_mul_by_2:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-
-	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_mul_by_3
-.private_extern	_ecp_nistz256_mul_by_3
-
-.align	4
-_ecp_nistz256_mul_by_3:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	mov	x4,x14
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-
-	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
-
-	mov	x8,x4
-	mov	x9,x5
-	mov	x10,x6
-	mov	x11,x7
-
-	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
-//				        const BN_ULONG x2[4]);
-.globl	_ecp_nistz256_sub
-.private_extern	_ecp_nistz256_sub
-
-.align	4
-_ecp_nistz256_sub:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ldp	x14,x15,[x1]
-	ldp	x16,x17,[x1,#16]
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	bl	__ecp_nistz256_sub_from
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
-.globl	_ecp_nistz256_neg
-.private_extern	_ecp_nistz256_neg
-
-.align	4
-_ecp_nistz256_neg:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	mov	x2,x1
-	mov	x14,xzr		// a = 0
-	mov	x15,xzr
-	mov	x16,xzr
-	mov	x17,xzr
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	bl	__ecp_nistz256_sub_from
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
-// to x4-x7 and b[0] - to x3
-
-.align	4
-__ecp_nistz256_mul_mont:
-	mul	x14,x4,x3		// a[0]*b[0]
-	umulh	x8,x4,x3
-
-	mul	x15,x5,x3		// a[1]*b[0]
-	umulh	x9,x5,x3
-
-	mul	x16,x6,x3		// a[2]*b[0]
-	umulh	x10,x6,x3
-
-	mul	x17,x7,x3		// a[3]*b[0]
-	umulh	x11,x7,x3
-	ldr	x3,[x2,#8]		// b[1]
-
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adc	x19,xzr,x11
-	mov	x20,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	mul	x8,x4,x3		// lo(a[0]*b[i])
-	adcs	x15,x16,x9
-	mul	x9,x5,x3		// lo(a[1]*b[i])
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	mul	x10,x6,x3		// lo(a[2]*b[i])
-	adcs	x17,x19,x11
-	mul	x11,x7,x3		// lo(a[3]*b[i])
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts of multiplication
-	umulh	x8,x4,x3		// hi(a[0]*b[i])
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3		// hi(a[1]*b[i])
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3		// hi(a[2]*b[i])
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3		// hi(a[3]*b[i])
-	adc	x19,x19,xzr
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	lsl	x8,x14,#32
-	adcs	x16,x16,x9
-	lsr	x9,x14,#32
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	// last reduction
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	adcs	x17,x19,x11
-	adc	x19,x20,xzr
-
-	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x19,xzr		// did it borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
-// to x4-x7
-
-.align	4
-__ecp_nistz256_sqr_mont:
-	//  |  |  |  |  |  |a1*a0|  |
-	//  |  |  |  |  |a2*a0|  |  |
-	//  |  |a3*a2|a3*a0|  |  |  |
-	//  |  |  |  |a2*a1|  |  |  |
-	//  |  |  |a3*a1|  |  |  |  |
-	// *|  |  |  |  |  |  |  | 2|
-	// +|a3*a3|a2*a2|a1*a1|a0*a0|
-	//  |--+--+--+--+--+--+--+--|
-	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit word of the accumulator
-	//
-	//  "can't overflow" below mark carrying into high part of
-	//  multiplication result, which can't overflow, because it
-	//  can never be all ones.
-
-	mul	x15,x5,x4		// a[1]*a[0]
-	umulh	x9,x5,x4
-	mul	x16,x6,x4		// a[2]*a[0]
-	umulh	x10,x6,x4
-	mul	x17,x7,x4		// a[3]*a[0]
-	umulh	x19,x7,x4
-
-	adds	x16,x16,x9		// accumulate high parts of multiplication
-	mul	x8,x6,x5		// a[2]*a[1]
-	umulh	x9,x6,x5
-	adcs	x17,x17,x10
-	mul	x10,x7,x5		// a[3]*a[1]
-	umulh	x11,x7,x5
-	adc	x19,x19,xzr		// can't overflow
-
-	mul	x20,x7,x6		// a[3]*a[2]
-	umulh	x1,x7,x6
-
-	adds	x9,x9,x10		// accumulate high parts of multiplication
-	mul	x14,x4,x4		// a[0]*a[0]
-	adc	x10,x11,xzr		// can't overflow
-
-	adds	x17,x17,x8		// accumulate low parts of multiplication
-	umulh	x4,x4,x4
-	adcs	x19,x19,x9
-	mul	x9,x5,x5		// a[1]*a[1]
-	adcs	x20,x20,x10
-	umulh	x5,x5,x5
-	adc	x1,x1,xzr		// can't overflow
-
-	adds	x15,x15,x15	// acc[1-6]*=2
-	mul	x10,x6,x6		// a[2]*a[2]
-	adcs	x16,x16,x16
-	umulh	x6,x6,x6
-	adcs	x17,x17,x17
-	mul	x11,x7,x7		// a[3]*a[3]
-	adcs	x19,x19,x19
-	umulh	x7,x7,x7
-	adcs	x20,x20,x20
-	adcs	x1,x1,x1
-	adc	x2,xzr,xzr
-
-	adds	x15,x15,x4		// +a[i]*a[i]
-	adcs	x16,x16,x9
-	adcs	x17,x17,x5
-	adcs	x19,x19,x10
-	adcs	x20,x20,x6
-	lsl	x8,x14,#32
-	adcs	x1,x1,x11
-	lsr	x9,x14,#32
-	adc	x2,x2,x7
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	lsl	x8,x14,#32
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	lsr	x9,x14,#32
-	adc	x17,x11,xzr		// can't overflow
-	subs	x10,x14,x8		// "*0xffff0001"
-	sbc	x11,x14,x9
-	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
-	adcs	x15,x16,x9
-	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
-	adc	x17,x11,xzr		// can't overflow
-
-	adds	x14,x14,x19	// accumulate upper half
-	adcs	x15,x15,x20
-	adcs	x16,x16,x1
-	adcs	x17,x17,x2
-	adc	x19,xzr,xzr
-
-	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x19,xzr		// did it borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
-// x4-x7 and x8-x11. This is done because it's used in multiple
-// contexts, e.g. in multiplication by 2 and 3...
-
-.align	4
-__ecp_nistz256_add_to:
-	adds	x14,x14,x8		// ret = a+b
-	adcs	x15,x15,x9
-	adcs	x16,x16,x10
-	adcs	x17,x17,x11
-	adc	x1,xzr,xzr		// zap x1
-
-	adds	x8,x14,#1		// subs	x8,x4,#-1 // tmp = ret-modulus
-	sbcs	x9,x15,x12
-	sbcs	x10,x16,xzr
-	sbcs	x11,x17,x13
-	sbcs	xzr,x1,xzr		// did subtraction borrow?
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-
-.align	4
-__ecp_nistz256_sub_from:
-	ldp	x8,x9,[x2]
-	ldp	x10,x11,[x2,#16]
-	subs	x14,x14,x8		// ret = a-b
-	sbcs	x15,x15,x9
-	sbcs	x16,x16,x10
-	sbcs	x17,x17,x11
-	sbc	x1,xzr,xzr		// zap x1
-
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adc	x11,x17,x13
-	cmp	x1,xzr			// did subtraction borrow?
-
-	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,eq
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-
-.align	4
-__ecp_nistz256_sub_morf:
-	ldp	x8,x9,[x2]
-	ldp	x10,x11,[x2,#16]
-	subs	x14,x8,x14		// ret = b-a
-	sbcs	x15,x9,x15
-	sbcs	x16,x10,x16
-	sbcs	x17,x11,x17
-	sbc	x1,xzr,xzr		// zap x1
-
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = ret+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adc	x11,x17,x13
-	cmp	x1,xzr			// did subtraction borrow?
-
-	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,eq
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-
-
-.align	4
-__ecp_nistz256_div_by_2:
-	subs	x8,x14,#1		// adds	x8,x4,#-1 // tmp = a+modulus
-	adcs	x9,x15,x12
-	adcs	x10,x16,xzr
-	adcs	x11,x17,x13
-	adc	x1,xzr,xzr		// zap x1
-	tst	x14,#1		// is a even?
-
-	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
-	csel	x15,x15,x9,eq
-	csel	x16,x16,x10,eq
-	csel	x17,x17,x11,eq
-	csel	x1,xzr,x1,eq
-
-	lsr	x14,x14,#1		// ret >>= 1
-	orr	x14,x14,x15,lsl#63
-	lsr	x15,x15,#1
-	orr	x15,x15,x16,lsl#63
-	lsr	x16,x16,#1
-	orr	x16,x16,x17,lsl#63
-	lsr	x17,x17,#1
-	stp	x14,x15,[x0]
-	orr	x17,x17,x1,lsl#63
-	stp	x16,x17,[x0,#16]
-
-	ret
-
-.globl	_ecp_nistz256_point_double
-.private_extern	_ecp_nistz256_point_double
-
-.align	5
-_ecp_nistz256_point_double:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	sub	sp,sp,#32*4
-
-Ldouble_shortcut:
-	ldp	x14,x15,[x1,#32]
-	mov	x21,x0
-	ldp	x16,x17,[x1,#48]
-	mov	x22,x1
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	mov	x8,x14
-	ldr	x13,[x13,#24]
-	mov	x9,x15
-	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[x22,#64+16]
-	add	x0,sp,#0
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
-
-	add	x0,sp,#64
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
-
-	ldp	x8,x9,[x22]
-	ldp	x10,x11,[x22,#16]
-	mov	x4,x14		// put Zsqr aside for p256_sub
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x0,sp,#32
-	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
-
-	add	x2,x22,#0
-	mov	x14,x4		// restore Zsqr
-	mov	x15,x5
-	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
-	mov	x16,x6
-	mov	x17,x7
-	ldp	x6,x7,[sp,#0+16]
-	add	x0,sp,#64
-	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
-
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
-
-	ldr	x3,[x22,#32]
-	ldp	x4,x5,[x22,#64]
-	ldp	x6,x7,[x22,#64+16]
-	add	x2,x22,#32
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
-
-	mov	x8,x14
-	mov	x9,x15
-	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[sp,#0+16]
-	add	x0,x21,#64
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
-
-	add	x0,sp,#96
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
-
-	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x0,x21,#32
-	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
-
-	add	x2,sp,#64
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
-
-	mov	x8,x14		// duplicate M
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	mov	x4,x14		// put M aside
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x0,sp,#32
-	bl	__ecp_nistz256_add_to
-	mov	x8,x4			// restore M
-	mov	x9,x5
-	ldr	x3,[x22]		// forward load for p256_mul_mont
-	mov	x10,x6
-	ldp	x4,x5,[sp,#0]
-	mov	x11,x7
-	ldp	x6,x7,[sp,#0+16]
-	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
-
-	add	x2,x22,#0
-	add	x0,sp,#0
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
-
-	mov	x8,x14
-	mov	x9,x15
-	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
-	mov	x10,x16
-	mov	x11,x17
-	ldp	x6,x7,[sp,#32+16]
-	add	x0,sp,#96
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
-
-	add	x0,x21,#0
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
-
-	add	x2,sp,#96
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
-
-	add	x2,sp,#0
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
-
-	ldr	x3,[sp,#32]
-	mov	x4,x14		// copy S
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	add	x2,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
-
-	add	x2,x21,#32
-	add	x0,x21,#32
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
-
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_ecp_nistz256_point_add
-.private_extern	_ecp_nistz256_point_add
-
-.align	5
-_ecp_nistz256_point_add:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#32*12
-
-	ldp	x4,x5,[x2,#64]	// in2_z
-	ldp	x6,x7,[x2,#64+16]
-	mov	x21,x0
-	mov	x22,x1
-	mov	x23,x2
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x25,x8,x10
-	cmp	x25,#0
-	csetm	x25,ne		// ~in2infty
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
-
-	ldp	x4,x5,[x22,#64]	// in1_z
-	ldp	x6,x7,[x22,#64+16]
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x24,x8,x10
-	cmp	x24,#0
-	csetm	x24,ne		// ~in1infty
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
-
-	ldr	x3,[x23,#64]
-	ldp	x4,x5,[sp,#192]
-	ldp	x6,x7,[sp,#192+16]
-	add	x2,x23,#64
-	add	x0,sp,#320
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,x22,#64
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
-
-	ldr	x3,[x22,#32]
-	ldp	x4,x5,[sp,#320]
-	ldp	x6,x7,[sp,#320+16]
-	add	x2,x22,#32
-	add	x0,sp,#320
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
-
-	ldr	x3,[x23,#32]
-	ldp	x4,x5,[sp,#352]
-	ldp	x6,x7,[sp,#352+16]
-	add	x2,x23,#32
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
-
-	add	x2,sp,#320
-	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
-	ldp	x4,x5,[x22]
-	ldp	x6,x7,[x22,#16]
-	add	x0,sp,#160
-	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
-
-	orr	x14,x14,x15	// see if result is zero
-	orr	x16,x16,x17
-	orr	x26,x14,x16	// ~is_equal(S1,S2)
-
-	add	x2,sp,#192
-	add	x0,sp,#256
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
-
-	ldr	x3,[sp,#128]
-	ldp	x4,x5,[x23]
-	ldp	x6,x7,[x23,#16]
-	add	x2,sp,#128
-	add	x0,sp,#288
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
-
-	add	x2,sp,#256
-	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
-	ldp	x6,x7,[sp,#160+16]
-	add	x0,sp,#96
-	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
-
-	orr	x14,x14,x15	// see if result is zero
-	orr	x16,x16,x17
-	orr	x14,x14,x16	// ~is_equal(U1,U2)
-
-	mvn	x27,x24	// -1/0 -> 0/-1
-	mvn	x28,x25	// -1/0 -> 0/-1
-	orr	x14,x14,x27
-	orr	x14,x14,x28
-	orr	x14,x14,x26
-	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
-
-Ladd_double:
-	mov	x1,x22
-	mov	x0,x21
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
-	b	Ldouble_shortcut
-
-.align	4
-Ladd_proceed:
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#96]
-	ldp	x6,x7,[sp,#96+16]
-	add	x2,x22,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
-
-	ldp	x4,x5,[sp,#96]
-	ldp	x6,x7,[sp,#96+16]
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
-
-	ldr	x3,[x23,#64]
-	ldp	x4,x5,[sp,#64]
-	ldp	x6,x7,[sp,#64+16]
-	add	x2,x23,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
-
-	ldr	x3,[sp,#96]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,sp,#96
-	add	x0,sp,#224
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
-
-	ldr	x3,[sp,#128]
-	ldp	x4,x5,[sp,#256]
-	ldp	x6,x7,[sp,#256+16]
-	add	x2,sp,#128
-	add	x0,sp,#288
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
-
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	add	x0,sp,#128
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
-
-	add	x2,sp,#192
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
-
-	add	x2,sp,#224
-	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
-
-	add	x2,sp,#288
-	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#320]
-	ldp	x6,x7,[sp,#320+16]
-	add	x0,sp,#32
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
-
-	add	x2,sp,#224
-	add	x0,sp,#352
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
-
-	ldr	x3,[sp,#160]
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x2,sp,#160
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
-
-	add	x2,sp,#352
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
-
-	ldp	x4,x5,[sp,#0]		// res
-	ldp	x6,x7,[sp,#0+16]
-	ldp	x8,x9,[x23]		// in2
-	ldp	x10,x11,[x23,#16]
-	ldp	x14,x15,[x22,#0]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#0+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+0+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+0+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#0+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#0+48]
-	stp	x14,x15,[x21,#0]
-	stp	x16,x17,[x21,#0+16]
-	ldp	x14,x15,[x22,#32]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#32+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+32+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+32+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#32+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#32+48]
-	stp	x14,x15,[x21,#32]
-	stp	x16,x17,[x21,#32+16]
-	ldp	x14,x15,[x22,#64]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#64+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	stp	x14,x15,[x21,#64]
-	stp	x16,x17,[x21,#64+16]
-
-Ladd_done:
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#96
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_ecp_nistz256_point_add_affine
-.private_extern	_ecp_nistz256_point_add_affine
-
-.align	5
-_ecp_nistz256_point_add_affine:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-80]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	sub	sp,sp,#32*10
-
-	mov	x21,x0
-	mov	x22,x1
-	mov	x23,x2
-	adrp	x13,Lpoly@PAGE
-	add	x13,x13,Lpoly@PAGEOFF
-	ldr	x12,[x13,#8]
-	ldr	x13,[x13,#24]
-
-	ldp	x4,x5,[x1,#64]	// in1_z
-	ldp	x6,x7,[x1,#64+16]
-	orr	x8,x4,x5
-	orr	x10,x6,x7
-	orr	x24,x8,x10
-	cmp	x24,#0
-	csetm	x24,ne		// ~in1infty
-
-	ldp	x14,x15,[x2]	// in2_x
-	ldp	x16,x17,[x2,#16]
-	ldp	x8,x9,[x2,#32]	// in2_y
-	ldp	x10,x11,[x2,#48]
-	orr	x14,x14,x15
-	orr	x16,x16,x17
-	orr	x8,x8,x9
-	orr	x10,x10,x11
-	orr	x14,x14,x16
-	orr	x8,x8,x10
-	orr	x25,x14,x8
-	cmp	x25,#0
-	csetm	x25,ne		// ~in2infty
-
-	add	x0,sp,#128
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
-
-	mov	x4,x14
-	mov	x5,x15
-	mov	x6,x16
-	mov	x7,x17
-	ldr	x3,[x23]
-	add	x2,x23,#0
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
-
-	add	x2,x22,#0
-	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x0,sp,#160
-	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
-
-	add	x2,x22,#64
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
-
-	ldr	x3,[x22,#64]
-	ldp	x4,x5,[sp,#160]
-	ldp	x6,x7,[sp,#160+16]
-	add	x2,x22,#64
-	add	x0,sp,#64
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
-
-	ldr	x3,[x23,#32]
-	ldp	x4,x5,[sp,#128]
-	ldp	x6,x7,[sp,#128+16]
-	add	x2,x23,#32
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
-
-	add	x2,x22,#32
-	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
-	ldp	x6,x7,[sp,#160+16]
-	add	x0,sp,#192
-	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
-
-	add	x0,sp,#224
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
-
-	ldp	x4,x5,[sp,#192]
-	ldp	x6,x7,[sp,#192+16]
-	add	x0,sp,#288
-	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
-
-	ldr	x3,[sp,#160]
-	ldp	x4,x5,[sp,#224]
-	ldp	x6,x7,[sp,#224+16]
-	add	x2,sp,#160
-	add	x0,sp,#256
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
-
-	ldr	x3,[x22]
-	ldp	x4,x5,[sp,#224]
-	ldp	x6,x7,[sp,#224+16]
-	add	x2,x22,#0
-	add	x0,sp,#96
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
-
-	mov	x8,x14
-	mov	x9,x15
-	mov	x10,x16
-	mov	x11,x17
-	add	x0,sp,#224
-	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
-
-	add	x2,sp,#288
-	add	x0,sp,#0
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
-
-	add	x2,sp,#256
-	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
-
-	add	x2,sp,#96
-	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
-	ldp	x4,x5,[sp,#256]
-	ldp	x6,x7,[sp,#256+16]
-	add	x0,sp,#32
-	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
-
-	add	x2,x22,#32
-	add	x0,sp,#128
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
-
-	ldr	x3,[sp,#192]
-	ldp	x4,x5,[sp,#32]
-	ldp	x6,x7,[sp,#32+16]
-	add	x2,sp,#192
-	add	x0,sp,#32
-	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
-
-	add	x2,sp,#128
-	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
-
-	ldp	x4,x5,[sp,#0]		// res
-	ldp	x6,x7,[sp,#0+16]
-	ldp	x8,x9,[x23]		// in2
-	ldp	x10,x11,[x23,#16]
-	ldp	x14,x15,[x22,#0]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#0+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+0+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+0+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#0+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#0+48]
-	stp	x14,x15,[x21,#0]
-	stp	x16,x17,[x21,#0+16]
-	adrp	x23,Lone_mont@PAGE-64
-	add	x23,x23,Lone_mont@PAGEOFF-64
-	ldp	x14,x15,[x22,#32]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#32+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	ldp	x4,x5,[sp,#0+32+32]	// res
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	ldp	x6,x7,[sp,#0+32+48]
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	ldp	x8,x9,[x23,#32+32]	// in2
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	ldp	x10,x11,[x23,#32+48]
-	stp	x14,x15,[x21,#32]
-	stp	x16,x17,[x21,#32+16]
-	ldp	x14,x15,[x22,#64]	// in1
-	cmp	x24,#0			// ~, remember?
-	ldp	x16,x17,[x22,#64+16]
-	csel	x8,x4,x8,ne
-	csel	x9,x5,x9,ne
-	csel	x10,x6,x10,ne
-	csel	x11,x7,x11,ne
-	cmp	x25,#0			// ~, remember?
-	csel	x14,x8,x14,ne
-	csel	x15,x9,x15,ne
-	csel	x16,x10,x16,ne
-	csel	x17,x11,x17,ne
-	stp	x14,x15,[x21,#64]
-	stp	x16,x17,[x21,#64+16]
-
-	add	sp,x29,#0		// destroy frame
-	ldp	x19,x20,[x29,#16]
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x29,x30,[sp],#80
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
-//                                uint64_t b[4]);
-.globl	_ecp_nistz256_ord_mul_mont
-.private_extern	_ecp_nistz256_ord_mul_mont
-
-.align	4
-_ecp_nistz256_ord_mul_mont:
-	AARCH64_VALID_CALL_TARGET
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	adrp	x23,Lord@PAGE
-	add	x23,x23,Lord@PAGEOFF
-	ldr	x3,[x2]		// bp[0]
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-
-	ldp	x12,x13,[x23,#0]
-	ldp	x21,x22,[x23,#16]
-	ldr	x23,[x23,#32]
-
-	mul	x14,x4,x3		// a[0]*b[0]
-	umulh	x8,x4,x3
-
-	mul	x15,x5,x3		// a[1]*b[0]
-	umulh	x9,x5,x3
-
-	mul	x16,x6,x3		// a[2]*b[0]
-	umulh	x10,x6,x3
-
-	mul	x17,x7,x3		// a[3]*b[0]
-	umulh	x19,x7,x3
-
-	mul	x24,x14,x23
-
-	adds	x15,x15,x8		// accumulate high parts of multiplication
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adc	x19,x19,xzr
-	mov	x20,xzr
-	ldr	x3,[x2,#8*1]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	ldr	x3,[x2,#8*2]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	ldr	x3,[x2,#8*3]		// b[i]
-
-	lsl	x8,x24,#32
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	mul	x8,x4,x3
-	adc	x11,x11,xzr
-	mul	x9,x5,x3
-
-	adds	x14,x15,x10
-	mul	x10,x6,x3
-	adcs	x15,x16,x11
-	mul	x11,x7,x3
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	adds	x14,x14,x8		// accumulate low parts
-	umulh	x8,x4,x3
-	adcs	x15,x15,x9
-	umulh	x9,x5,x3
-	adcs	x16,x16,x10
-	umulh	x10,x6,x3
-	adcs	x17,x17,x11
-	umulh	x11,x7,x3
-	adc	x19,x19,xzr
-	mul	x24,x14,x23
-	adds	x15,x15,x8		// accumulate high parts
-	adcs	x16,x16,x9
-	adcs	x17,x17,x10
-	adcs	x19,x19,x11
-	adc	x20,xzr,xzr
-	lsl	x8,x24,#32		// last reduction
-	subs	x16,x16,x24
-	lsr	x9,x24,#32
-	sbcs	x17,x17,x8
-	sbcs	x19,x19,x9
-	sbc	x20,x20,xzr
-
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adcs	x17,x19,x24
-	adc	x19,x20,xzr
-
-	subs	x8,x14,x12		// ret -= modulus
-	sbcs	x9,x15,x13
-	sbcs	x10,x16,x21
-	sbcs	x11,x17,x22
-	sbcs	xzr,x19,xzr
-
-	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x15,x15,x9,lo
-	csel	x16,x16,x10,lo
-	stp	x14,x15,[x0]
-	csel	x17,x17,x11,lo
-	stp	x16,x17,[x0,#16]
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldr	x29,[sp],#64
-	ret
-
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
-//                                uint64_t rep);
-.globl	_ecp_nistz256_ord_sqr_mont
-.private_extern	_ecp_nistz256_ord_sqr_mont
-
-.align	4
-_ecp_nistz256_ord_sqr_mont:
-	AARCH64_VALID_CALL_TARGET
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	stp	x29,x30,[sp,#-64]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-
-	adrp	x23,Lord@PAGE
-	add	x23,x23,Lord@PAGEOFF
-	ldp	x4,x5,[x1]
-	ldp	x6,x7,[x1,#16]
-
-	ldp	x12,x13,[x23,#0]
-	ldp	x21,x22,[x23,#16]
-	ldr	x23,[x23,#32]
-	b	Loop_ord_sqr
-
-.align	4
-Loop_ord_sqr:
-	sub	x2,x2,#1
-	////////////////////////////////////////////////////////////////
-	//  |  |  |  |  |  |a1*a0|  |
-	//  |  |  |  |  |a2*a0|  |  |
-	//  |  |a3*a2|a3*a0|  |  |  |
-	//  |  |  |  |a2*a1|  |  |  |
-	//  |  |  |a3*a1|  |  |  |  |
-	// *|  |  |  |  |  |  |  | 2|
-	// +|a3*a3|a2*a2|a1*a1|a0*a0|
-	//  |--+--+--+--+--+--+--+--|
-	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit word of the accumulator
-	//
-	//  "can't overflow" below mark carrying into high part of
-	//  multiplication result, which can't overflow, because it
-	//  can never be all ones.
-
-	mul	x15,x5,x4		// a[1]*a[0]
-	umulh	x9,x5,x4
-	mul	x16,x6,x4		// a[2]*a[0]
-	umulh	x10,x6,x4
-	mul	x17,x7,x4		// a[3]*a[0]
-	umulh	x19,x7,x4
-
-	adds	x16,x16,x9		// accumulate high parts of multiplication
-	mul	x8,x6,x5		// a[2]*a[1]
-	umulh	x9,x6,x5
-	adcs	x17,x17,x10
-	mul	x10,x7,x5		// a[3]*a[1]
-	umulh	x11,x7,x5
-	adc	x19,x19,xzr		// can't overflow
-
-	mul	x20,x7,x6		// a[3]*a[2]
-	umulh	x1,x7,x6
-
-	adds	x9,x9,x10		// accumulate high parts of multiplication
-	mul	x14,x4,x4		// a[0]*a[0]
-	adc	x10,x11,xzr		// can't overflow
-
-	adds	x17,x17,x8		// accumulate low parts of multiplication
-	umulh	x4,x4,x4
-	adcs	x19,x19,x9
-	mul	x9,x5,x5		// a[1]*a[1]
-	adcs	x20,x20,x10
-	umulh	x5,x5,x5
-	adc	x1,x1,xzr		// can't overflow
-
-	adds	x15,x15,x15	// acc[1-6]*=2
-	mul	x10,x6,x6		// a[2]*a[2]
-	adcs	x16,x16,x16
-	umulh	x6,x6,x6
-	adcs	x17,x17,x17
-	mul	x11,x7,x7		// a[3]*a[3]
-	adcs	x19,x19,x19
-	umulh	x7,x7,x7
-	adcs	x20,x20,x20
-	adcs	x1,x1,x1
-	adc	x3,xzr,xzr
-
-	adds	x15,x15,x4		// +a[i]*a[i]
-	mul	x24,x14,x23
-	adcs	x16,x16,x9
-	adcs	x17,x17,x5
-	adcs	x19,x19,x10
-	adcs	x20,x20,x6
-	adcs	x1,x1,x11
-	adc	x3,x3,x7
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adc	x17,xzr,x24		// can't overflow
-	mul	x11,x14,x23
-	lsl	x8,x24,#32
-	subs	x15,x15,x24
-	lsr	x9,x24,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x11
-	mul	x10,x13,x11
-	umulh	x24,x13,x11
-
-	adcs	x10,x10,x9
-	adc	x24,x24,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x24
-	adcs	x16,x17,x11
-	adc	x17,xzr,x11		// can't overflow
-	mul	x24,x14,x23
-	lsl	x8,x11,#32
-	subs	x15,x15,x11
-	lsr	x9,x11,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x24
-	mul	x10,x13,x24
-	umulh	x11,x13,x24
-
-	adcs	x10,x10,x9
-	adc	x11,x11,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x11
-	adcs	x16,x17,x24
-	adc	x17,xzr,x24		// can't overflow
-	mul	x11,x14,x23
-	lsl	x8,x24,#32
-	subs	x15,x15,x24
-	lsr	x9,x24,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	subs	xzr,x14,#1
-	umulh	x9,x12,x11
-	mul	x10,x13,x11
-	umulh	x24,x13,x11
-
-	adcs	x10,x10,x9
-	adc	x24,x24,xzr
-
-	adds	x14,x15,x10
-	adcs	x15,x16,x24
-	adcs	x16,x17,x11
-	adc	x17,xzr,x11		// can't overflow
-	lsl	x8,x11,#32
-	subs	x15,x15,x11
-	lsr	x9,x11,#32
-	sbcs	x16,x16,x8
-	sbc	x17,x17,x9		// can't borrow
-	adds	x14,x14,x19	// accumulate upper half
-	adcs	x15,x15,x20
-	adcs	x16,x16,x1
-	adcs	x17,x17,x3
-	adc	x19,xzr,xzr
-
-	subs	x8,x14,x12		// ret -= modulus
-	sbcs	x9,x15,x13
-	sbcs	x10,x16,x21
-	sbcs	x11,x17,x22
-	sbcs	xzr,x19,xzr
-
-	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
-	csel	x5,x15,x9,lo
-	csel	x6,x16,x10,lo
-	csel	x7,x17,x11,lo
-
-	cbnz	x2,Loop_ord_sqr
-
-	stp	x4,x5,[x0]
-	stp	x6,x7,[x0,#16]
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldr	x29,[sp],#64
-	ret
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.globl	_ecp_nistz256_select_w5
-.private_extern	_ecp_nistz256_select_w5
-
-.align	4
-_ecp_nistz256_select_w5:
-	AARCH64_VALID_CALL_TARGET
-
-    // x10 := x0
-    // w9 := 0; loop counter and incremented internal index
-	mov	x10, x0
-	mov	w9, #0
-
-    // [v16-v21] := 0
-	movi	v16.16b, #0
-	movi	v17.16b, #0
-	movi	v18.16b, #0
-	movi	v19.16b, #0
-	movi	v20.16b, #0
-	movi	v21.16b, #0
-
-Lselect_w5_loop:
-    // Loop 16 times.
-
-    // Increment index (loop counter); tested at the end of the loop
-	add	w9, w9, #1
-
-    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
-    //  and advance x1 to point to the next entry
-	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
-    // x11 := (w9 == w2)? All 1s : All 0s
-	cmp	w9, w2
-	csetm	x11, eq
-
-    // continue loading ...
-	ld1	{v26.2d, v27.2d}, [x1],#32
-
-    // duplicate mask_64 into Mask (all 0s or all 1s)
-	dup	v3.2d, x11
-
-    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
-    // i.e., values in output registers will remain the same if w9 != w2
-	bit	v16.16b, v22.16b, v3.16b
-	bit	v17.16b, v23.16b, v3.16b
-
-	bit	v18.16b, v24.16b, v3.16b
-	bit	v19.16b, v25.16b, v3.16b
-
-	bit	v20.16b, v26.16b, v3.16b
-	bit	v21.16b, v27.16b, v3.16b
-
-    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
-	tbz	w9, #4, Lselect_w5_loop
-
-    // Write [v16-v21] to memory at the output pointer
-	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
-	st1	{v20.2d, v21.2d}, [x10]
-
-	ret
-
-
-
-////////////////////////////////////////////////////////////////////////
-// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl	_ecp_nistz256_select_w7
-.private_extern	_ecp_nistz256_select_w7
-
-.align	4
-_ecp_nistz256_select_w7:
-	AARCH64_VALID_CALL_TARGET
-
-    // w9 := 0; loop counter and incremented internal index
-	mov	w9, #0
-
-    // [v16-v21] := 0
-	movi	v16.16b, #0
-	movi	v17.16b, #0
-	movi	v18.16b, #0
-	movi	v19.16b, #0
-
-Lselect_w7_loop:
-    // Loop 64 times.
-
-    // Increment index (loop counter); tested at the end of the loop
-	add	w9, w9, #1
-
-    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
-    //  and advance x1 to point to the next entry
-	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
-
-    // x11 := (w9 == w2)? All 1s : All 0s
-	cmp	w9, w2
-	csetm	x11, eq
-
-    // duplicate mask_64 into Mask (all 0s or all 1s)
-	dup	v3.2d, x11
-
-    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
-    // i.e., values in output registers will remain the same if w9 != w2
-	bit	v16.16b, v22.16b, v3.16b
-	bit	v17.16b, v23.16b, v3.16b
-
-	bit	v18.16b, v24.16b, v3.16b
-	bit	v19.16b, v25.16b, v3.16b
-
-    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
-	tbz	w9, #6, Lselect_w7_loop
-
-    // Write [v16-v19] to memory at the output pointer
-	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
-
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
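
For reference, the ecp_nistz256 routines above all work in the Montgomery domain with radix R = 2^256: mul_mont/sqr_mont reduce modulo the field prime (Lpoly), ord_mul_mont/ord_sqr_mont modulo the group order (Lord, with LordK the usual per-word Montgomery constant -n^-1 mod 2^64), and select_w5/select_w7 are constant-time table lookups for the scalar-multiplication windows. A minimal Python sketch of the arithmetic contract (plain big integers, nothing constant-time; helper names are illustrative):

    P256 = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff   # Lpoly
    ORDER = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551  # Lord
    R = 1 << 256                        # Montgomery radix for four 64-bit limbs

    def to_mont(x, m=P256):
        return x * R % m

    def from_mont(x, m=P256):
        return x * pow(R, -1, m) % m

    def mont_mul(a, b, m=P256):
        # What ecp_nistz256_mul_mont (m = P256) and _ord_mul_mont (m = ORDER)
        # return for inputs already in Montgomery form: a*b*R^-1 mod m.
        return a * b * pow(R, -1, m) % m

    # Multiplying Montgomery forms and converting back matches plain modular math.
    a, b = 0x1234, 0x56789
    assert from_mont(mont_mul(to_mont(a), to_mont(b))) == a * b % P256
    assert from_mont(mont_mul(to_mont(a, ORDER), to_mont(b, ORDER), ORDER), ORDER) == a * b % ORDER

The limb-level shortcuts in the assembly (the "*0xffff0001" shift-and-subtract steps, the lsl/lsr reductions in the ord path) come from the special shape of these moduli; the sketch only pins down what the functions compute.
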
diff --git a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S b/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S
deleted file mode 100644
index 49ea9b8..0000000
--- a/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-apple.S
+++ /dev/null
@@ -1,309 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include "openssl/arm_arch.h"
-
-.text
-.globl	_beeu_mod_inverse_vartime
-.private_extern	_beeu_mod_inverse_vartime
-
-.align	4
-_beeu_mod_inverse_vartime:
-    // Reserve enough space for 14 8-byte registers on the stack
-    // in the first stp call for x29, x30.
-    // Then store the remaining callee-saved registers.
-    //
-    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
-    //    ^                                                     ^
-    //    sp  <------------------- 112 bytes ----------------> old sp
-    //   x29 (FP)
-    //
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-112]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	stp	x0,x2,[sp,#96]
-
-    // B = b3..b0 := a
-	ldp	x25,x26,[x1]
-	ldp	x27,x28,[x1,#16]
-
-    // n3..n0 := n
-    // Note: the value of input params are changed in the following.
-	ldp	x0,x1,[x2]
-	ldp	x2,x30,[x2,#16]
-
-    // A = a3..a0 := n
-	mov	x21, x0
-	mov	x22, x1
-	mov	x23, x2
-	mov	x24, x30
-
-    // X = x4..x0 := 1
-	mov	x3, #1
-	eor	x4, x4, x4
-	eor	x5, x5, x5
-	eor	x6, x6, x6
-	eor	x7, x7, x7
-
-    // Y = y4..y0 := 0
-	eor	x8, x8, x8
-	eor	x9, x9, x9
-	eor	x10, x10, x10
-	eor	x11, x11, x11
-	eor	x12, x12, x12
-
-Lbeeu_loop:
-    // if B == 0, jump to .Lbeeu_loop_end
-	orr	x14, x25, x26
-	orr	x14, x14, x27
-
-    // reverse the bit order of x25. This is needed for clz after this macro
-	rbit	x15, x25
-
-	orr	x14, x14, x28
-	cbz	x14,Lbeeu_loop_end
-
-
-    // 0 < B < |n|,
-    // 0 < A <= |n|,
-    // (1)      X*a  ==  B   (mod |n|),
-    // (2) (-1)*Y*a  ==  A   (mod |n|)
-
-    // Now divide B by the maximum possible power of two in the
-    // integers, and divide X by the same value mod |n|.
-    // When we're done, (1) still holds.
-
-    // shift := number of trailing 0s in x25
-    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
-	clz	x13, x15
-
-    // If there is no shift, goto shift_A_Y
-	cbz	x13, Lbeeu_shift_A_Y
-
-    // Shift B right by "x13" bits
-	neg	x14, x13
-	lsr	x25, x25, x13
-	lsl	x15, x26, x14
-
-	lsr	x26, x26, x13
-	lsl	x19, x27, x14
-
-	orr	x25, x25, x15
-
-	lsr	x27, x27, x13
-	lsl	x20, x28, x14
-
-	orr	x26, x26, x19
-
-	lsr	x28, x28, x13
-
-	orr	x27, x27, x20
-
-
-    // Shift X right by "x13" bits, adding n whenever X becomes odd.
-    // x13--;
-    // x14 := 0; needed in the addition to the most significant word in SHIFT1
-	eor	x14, x14, x14
-Lbeeu_shift_loop_X:
-	tbz	x3, #0, Lshift1_0
-	adds	x3, x3, x0
-	adcs	x4, x4, x1
-	adcs	x5, x5, x2
-	adcs	x6, x6, x30
-	adc	x7, x7, x14
-Lshift1_0:
-    // var0 := [var1|var0]<64..1>;
-    // i.e. concatenate var1 and var0,
-    //      extract bits <64..1> from the resulting 128-bit value
-    //      and put them in var0
-	extr	x3, x4, x3, #1
-	extr	x4, x5, x4, #1
-	extr	x5, x6, x5, #1
-	extr	x6, x7, x6, #1
-	lsr	x7, x7, #1
-
-	subs	x13, x13, #1
-	bne	Lbeeu_shift_loop_X
-
-    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
-    // with the following differences:
-    // - "x13" is set directly to the number of trailing 0s in B
-    //   (using rbit and clz instructions)
-    // - The loop is only used to call SHIFT1(X)
-    //   and x13 is decreased while executing the X loop.
-    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
-
-Lbeeu_shift_A_Y:
-    // Same for A and Y.
-    // Afterwards, (2) still holds.
-    // Reverse the bit order of x21
-    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
-	rbit	x15, x21
-	clz	x13, x15
-
-    // If there is no shift, goto |B-A|, X+Y update
-	cbz	x13, Lbeeu_update_B_X_or_A_Y
-
-    // Shift A right by "x13" bits
-	neg	x14, x13
-	lsr	x21, x21, x13
-	lsl	x15, x22, x14
-
-	lsr	x22, x22, x13
-	lsl	x19, x23, x14
-
-	orr	x21, x21, x15
-
-	lsr	x23, x23, x13
-	lsl	x20, x24, x14
-
-	orr	x22, x22, x19
-
-	lsr	x24, x24, x13
-
-	orr	x23, x23, x20
-
-
-    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
-    // x13--;
-    // x14 := 0; needed in the addition to the most significant word in SHIFT1
-	eor	x14, x14, x14
-Lbeeu_shift_loop_Y:
-	tbz	x8, #0, Lshift1_1
-	adds	x8, x8, x0
-	adcs	x9, x9, x1
-	adcs	x10, x10, x2
-	adcs	x11, x11, x30
-	adc	x12, x12, x14
-Lshift1_1:
-    // var0 := [var1|var0]<64..1>;
-    // i.e. concatenate var1 and var0,
-    //      extract bits <64..1> from the resulting 128-bit value
-    //      and put them in var0
-	extr	x8, x9, x8, #1
-	extr	x9, x10, x9, #1
-	extr	x10, x11, x10, #1
-	extr	x11, x12, x11, #1
-	lsr	x12, x12, #1
-
-	subs	x13, x13, #1
-	bne	Lbeeu_shift_loop_Y
-
-Lbeeu_update_B_X_or_A_Y:
-    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
-    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
-    //       without taking a sign bit if generated. The lack of a carry would
-    //       indicate a negative result. See, for example,
-    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
-	subs	x14, x25, x21
-	sbcs	x15, x26, x22
-	sbcs	x19, x27, x23
-	sbcs	x20, x28, x24
-	bcs	Lbeeu_B_greater_than_A
-
-    // Else A > B =>
-    // A := A - B; Y := Y + X; goto beginning of the loop
-	subs	x21, x21, x25
-	sbcs	x22, x22, x26
-	sbcs	x23, x23, x27
-	sbcs	x24, x24, x28
-
-	adds	x8, x8, x3
-	adcs	x9, x9, x4
-	adcs	x10, x10, x5
-	adcs	x11, x11, x6
-	adc	x12, x12, x7
-	b	Lbeeu_loop
-
-Lbeeu_B_greater_than_A:
-    // Continue with B > A =>
-    // B := B - A; X := X + Y; goto beginning of the loop
-	mov	x25, x14
-	mov	x26, x15
-	mov	x27, x19
-	mov	x28, x20
-
-	adds	x3, x3, x8
-	adcs	x4, x4, x9
-	adcs	x5, x5, x10
-	adcs	x6, x6, x11
-	adc	x7, x7, x12
-	b	Lbeeu_loop
-
-Lbeeu_loop_end:
-    // The Euclid's algorithm loop ends when A == gcd(a,n);
-    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
-    // Since (-1)*Y*a == A (mod |n|), Y>0
-    // then out = -Y mod n
-
-    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
-    // Is A-1 == 0?
-    // If not, fail.
-	sub	x14, x21, #1
-	orr	x14, x14, x22
-	orr	x14, x14, x23
-	orr	x14, x14, x24
-	cbnz	x14, Lbeeu_err
-
-    // If Y>n ==> Y:=Y-n
-Lbeeu_reduction_loop:
-    // x_i := y_i - n_i (X is no longer needed, use it as temp)
-    // (x14 = 0 from above)
-	subs	x3, x8, x0
-	sbcs	x4, x9, x1
-	sbcs	x5, x10, x2
-	sbcs	x6, x11, x30
-	sbcs	x7, x12, x14
-
-    // If result is non-negative (i.e., cs = carry set = no borrow),
-    // y_i := x_i; goto reduce again
-    // else
-    // y_i := y_i; continue
-	csel	x8, x3, x8, cs
-	csel	x9, x4, x9, cs
-	csel	x10, x5, x10, cs
-	csel	x11, x6, x11, cs
-	csel	x12, x7, x12, cs
-	bcs	Lbeeu_reduction_loop
-
-    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
-    // out = -Y = n-Y
-	subs	x8, x0, x8
-	sbcs	x9, x1, x9
-	sbcs	x10, x2, x10
-	sbcs	x11, x30, x11
-
-    // Save Y in output (out (x0) was saved on the stack)
-	ldr	x3, [sp,#96]
-	stp	x8, x9, [x3]
-	stp	x10, x11, [x3,#16]
-    // return 1 (success)
-	mov	x0, #1
-	b	Lbeeu_finish
-
-Lbeeu_err:
-    // return 0 (error)
-	eor	x0, x0, x0
-
-Lbeeu_finish:
-    // Restore callee-saved registers, except x0, x2
-	add	sp,x29,#0
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldp	x25,x26,[sp,#64]
-	ldp	x27,x28,[sp,#80]
-	ldp	x29,x30,[sp],#112
-
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
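
For reference, beeu_mod_inverse_vartime above is a variable-time binary extended Euclidean inversion; the loop maintains exactly the invariants stated in its comments, X*a == B (mod |n|) and (-1)*Y*a == A (mod |n|), and the result is -Y mod n once A reaches 1. A minimal Python transliteration of that control flow (big integers, variable time, n odd; the function name mirrors the assembly symbol but the code is only a sketch):

    def beeu_mod_inverse_vartime(a, n):
        # Returns a^-1 mod n, or None where the assembly returns a 0 status
        # because gcd(a, n) != 1. Requires n odd (the P-256 group order here).
        A, B, X, Y = n, a % n, 1, 0
        while B:
            while B % 2 == 0:              # strip factors of 2 from B,
                B >>= 1                    # halving X modulo n at the same time
                X = (X + n if X % 2 else X) >> 1
            while A % 2 == 0:              # same for A and Y
                A >>= 1
                Y = (Y + n if Y % 2 else Y) >> 1
            if B >= A:
                B, X = B - A, X + Y        # preserves X*a == B (mod n)
            else:
                A, Y = A - B, Y + X        # preserves -Y*a == A (mod n)
        if A != 1:
            return None
        return -Y % n

    assert beeu_mod_inverse_vartime(3, 7) == 5
    assert beeu_mod_inverse_vartime(12345, 2**255 - 19) * 12345 % (2**255 - 19) == 1
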
diff --git a/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S
deleted file mode 100644
index 744c630..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha1-armv8-apple.S
+++ /dev/null
@@ -1,1227 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha1_block_data_order
-.private_extern	_sha1_block_data_order
-
-.align	6
-_sha1_block_data_order:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA1
-	b.ne	Lv8_entry
-
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-
-	ldp	w20,w21,[x0]
-	ldp	w22,w23,[x0,#8]
-	ldr	w24,[x0,#16]
-
-Loop:
-	ldr	x3,[x1],#64
-	movz	w28,#0x7999
-	sub	x2,x2,#1
-	movk	w28,#0x5a82,lsl#16
-#ifdef	__AARCH64EB__
-	ror	x3,x3,#32
-#else
-	rev32	x3,x3
-#endif
-	add	w24,w24,w28		// warm it up
-	add	w24,w24,w3
-	lsr	x4,x3,#32
-	ldr	x5,[x1,#-56]
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w4	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x5,x5,#32
-#else
-	rev32	x5,x5
-#endif
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w5	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	lsr	x6,x5,#32
-	ldr	x7,[x1,#-48]
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w6	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x7,x7,#32
-#else
-	rev32	x7,x7
-#endif
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w7	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	lsr	x8,x7,#32
-	ldr	x9,[x1,#-40]
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	add	w24,w24,w8	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x9,x9,#32
-#else
-	rev32	x9,x9
-#endif
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w9	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	lsr	x10,x9,#32
-	ldr	x11,[x1,#-32]
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w10	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x11,x11,#32
-#else
-	rev32	x11,x11
-#endif
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w11	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	lsr	x12,x11,#32
-	ldr	x13,[x1,#-24]
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w12	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x13,x13,#32
-#else
-	rev32	x13,x13
-#endif
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	add	w24,w24,w13	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	lsr	x14,x13,#32
-	ldr	x15,[x1,#-16]
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	add	w23,w23,w14	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x15,x15,#32
-#else
-	rev32	x15,x15
-#endif
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	add	w22,w22,w15	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	lsr	x16,x15,#32
-	ldr	x17,[x1,#-8]
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	add	w21,w21,w16	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-#ifdef	__AARCH64EB__
-	ror	x17,x17,#32
-#else
-	rev32	x17,x17
-#endif
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w17	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	lsr	x19,x17,#32
-	eor	w3,w3,w5
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	eor	w3,w3,w11
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	eor	w3,w3,w16
-	ror	w22,w22,#2
-	add	w24,w24,w19	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	bic	w25,w23,w21
-	and	w26,w22,w21
-	ror	w27,w20,#27
-	eor	w4,w4,w12
-	add	w23,w23,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w24,w24,w27		// e+=rot(a,5)
-	eor	w4,w4,w17
-	ror	w21,w21,#2
-	add	w23,w23,w3	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	bic	w25,w22,w20
-	and	w26,w21,w20
-	ror	w27,w24,#27
-	eor	w5,w5,w13
-	add	w22,w22,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w23,w23,w27		// e+=rot(a,5)
-	eor	w5,w5,w19
-	ror	w20,w20,#2
-	add	w22,w22,w4	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	bic	w25,w21,w24
-	and	w26,w20,w24
-	ror	w27,w23,#27
-	eor	w6,w6,w14
-	add	w21,w21,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w22,w22,w27		// e+=rot(a,5)
-	eor	w6,w6,w3
-	ror	w24,w24,#2
-	add	w21,w21,w5	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	bic	w25,w20,w23
-	and	w26,w24,w23
-	ror	w27,w22,#27
-	eor	w7,w7,w15
-	add	w20,w20,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w21,w21,w27		// e+=rot(a,5)
-	eor	w7,w7,w4
-	ror	w23,w23,#2
-	add	w20,w20,w6	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	movz	w28,#0xeba1
-	movk	w28,#0x6ed9,lsl#16
-	eor	w8,w8,w10
-	bic	w25,w24,w22
-	and	w26,w23,w22
-	ror	w27,w21,#27
-	eor	w8,w8,w16
-	add	w24,w24,w28		// future e+=K
-	orr	w25,w25,w26
-	add	w20,w20,w27		// e+=rot(a,5)
-	eor	w8,w8,w5
-	ror	w22,w22,#2
-	add	w24,w24,w7	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w9,w9,w6
-	add	w23,w23,w8	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w10,w10,w7
-	add	w22,w22,w9	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w11,w11,w8
-	add	w21,w21,w10	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	eor	w12,w12,w14
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w12,w12,w9
-	add	w20,w20,w11	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	eor	w13,w13,w15
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w13,w13,w5
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w13,w13,w10
-	add	w24,w24,w12	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	eor	w14,w14,w16
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w14,w14,w6
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w14,w14,w11
-	add	w23,w23,w13	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	eor	w15,w15,w17
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w15,w15,w7
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w15,w15,w12
-	add	w22,w22,w14	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	eor	w16,w16,w19
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w16,w16,w8
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w16,w16,w13
-	add	w21,w21,w15	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w17,w17,w14
-	add	w20,w20,w16	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w19,w19,w15
-	add	w24,w24,w17	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	eor	w3,w3,w5
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w3,w3,w11
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w3,w3,w16
-	add	w23,w23,w19	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w4,w4,w12
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w4,w4,w17
-	add	w22,w22,w3	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w5,w5,w13
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w5,w5,w19
-	add	w21,w21,w4	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w6,w6,w14
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w6,w6,w3
-	add	w20,w20,w5	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w7,w7,w15
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w7,w7,w4
-	add	w24,w24,w6	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	eor	w8,w8,w10
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w8,w8,w16
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w8,w8,w5
-	add	w23,w23,w7	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w9,w9,w6
-	add	w22,w22,w8	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w10,w10,w7
-	add	w21,w21,w9	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w11,w11,w8
-	add	w20,w20,w10	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	movz	w28,#0xbcdc
-	movk	w28,#0x8f1b,lsl#16
-	eor	w12,w12,w14
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w12,w12,w9
-	add	w24,w24,w11	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w13,w13,w15
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w13,w13,w5
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w13,w13,w10
-	add	w23,w23,w12	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w14,w14,w16
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w14,w14,w6
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w14,w14,w11
-	add	w22,w22,w13	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w15,w15,w17
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w15,w15,w7
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w15,w15,w12
-	add	w21,w21,w14	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w16,w16,w19
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w16,w16,w8
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w16,w16,w13
-	add	w20,w20,w15	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w17,w17,w3
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w17,w17,w9
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w17,w17,w14
-	add	w24,w24,w16	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w19,w19,w4
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w19,w19,w10
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w19,w19,w15
-	add	w23,w23,w17	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w3,w3,w5
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w3,w3,w11
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w3,w3,w16
-	add	w22,w22,w19	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w4,w4,w6
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w4,w4,w12
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w4,w4,w17
-	add	w21,w21,w3	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w5,w5,w7
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w5,w5,w13
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w5,w5,w19
-	add	w20,w20,w4	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w6,w6,w8
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w6,w6,w14
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w6,w6,w3
-	add	w24,w24,w5	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w7,w7,w9
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w7,w7,w15
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w7,w7,w4
-	add	w23,w23,w6	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w8,w8,w10
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w8,w8,w16
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w8,w8,w5
-	add	w22,w22,w7	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w9,w9,w11
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w9,w9,w17
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w9,w9,w6
-	add	w21,w21,w8	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w10,w10,w12
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w10,w10,w19
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w10,w10,w7
-	add	w20,w20,w9	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w11,w11,w13
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w11,w11,w3
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w11,w11,w8
-	add	w24,w24,w10	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	orr	w25,w21,w22
-	and	w26,w21,w22
-	eor	w12,w12,w14
-	ror	w27,w20,#27
-	and	w25,w25,w23
-	add	w23,w23,w28		// future e+=K
-	eor	w12,w12,w4
-	add	w24,w24,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w21,w21,#2
-	eor	w12,w12,w9
-	add	w23,w23,w11	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	orr	w25,w20,w21
-	and	w26,w20,w21
-	eor	w13,w13,w15
-	ror	w27,w24,#27
-	and	w25,w25,w22
-	add	w22,w22,w28		// future e+=K
-	eor	w13,w13,w5
-	add	w23,w23,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w20,w20,#2
-	eor	w13,w13,w10
-	add	w22,w22,w12	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	orr	w25,w24,w20
-	and	w26,w24,w20
-	eor	w14,w14,w16
-	ror	w27,w23,#27
-	and	w25,w25,w21
-	add	w21,w21,w28		// future e+=K
-	eor	w14,w14,w6
-	add	w22,w22,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w24,w24,#2
-	eor	w14,w14,w11
-	add	w21,w21,w13	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	orr	w25,w23,w24
-	and	w26,w23,w24
-	eor	w15,w15,w17
-	ror	w27,w22,#27
-	and	w25,w25,w20
-	add	w20,w20,w28		// future e+=K
-	eor	w15,w15,w7
-	add	w21,w21,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w23,w23,#2
-	eor	w15,w15,w12
-	add	w20,w20,w14	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	movz	w28,#0xc1d6
-	movk	w28,#0xca62,lsl#16
-	orr	w25,w22,w23
-	and	w26,w22,w23
-	eor	w16,w16,w19
-	ror	w27,w21,#27
-	and	w25,w25,w24
-	add	w24,w24,w28		// future e+=K
-	eor	w16,w16,w8
-	add	w20,w20,w27		// e+=rot(a,5)
-	orr	w25,w25,w26
-	ror	w22,w22,#2
-	eor	w16,w16,w13
-	add	w24,w24,w15	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w17,w17,w14
-	add	w23,w23,w16	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w19,w19,w15
-	add	w22,w22,w17	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	eor	w3,w3,w5
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w3,w3,w11
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w3,w3,w16
-	add	w21,w21,w19	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w3,w3,#31
-	eor	w4,w4,w6
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w4,w4,w12
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w4,w4,w17
-	add	w20,w20,w3	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w4,w4,#31
-	eor	w5,w5,w7
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w5,w5,w13
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w5,w5,w19
-	add	w24,w24,w4	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w5,w5,#31
-	eor	w6,w6,w8
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w6,w6,w14
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w6,w6,w3
-	add	w23,w23,w5	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w6,w6,#31
-	eor	w7,w7,w9
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w7,w7,w15
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w7,w7,w4
-	add	w22,w22,w6	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w7,w7,#31
-	eor	w8,w8,w10
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w8,w8,w16
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w8,w8,w5
-	add	w21,w21,w7	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w8,w8,#31
-	eor	w9,w9,w11
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w9,w9,w17
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w9,w9,w6
-	add	w20,w20,w8	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w9,w9,#31
-	eor	w10,w10,w12
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w10,w10,w19
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w10,w10,w7
-	add	w24,w24,w9	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w10,w10,#31
-	eor	w11,w11,w13
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w11,w11,w3
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w11,w11,w8
-	add	w23,w23,w10	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w11,w11,#31
-	eor	w12,w12,w14
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w12,w12,w4
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w12,w12,w9
-	add	w22,w22,w11	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w12,w12,#31
-	eor	w13,w13,w15
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w13,w13,w5
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w13,w13,w10
-	add	w21,w21,w12	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w13,w13,#31
-	eor	w14,w14,w16
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w14,w14,w6
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	eor	w14,w14,w11
-	add	w20,w20,w13	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ror	w14,w14,#31
-	eor	w15,w15,w17
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	add	w24,w24,w28		// future e+=K
-	eor	w15,w15,w7
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	eor	w15,w15,w12
-	add	w24,w24,w14	// future e+=X[i]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	ror	w15,w15,#31
-	eor	w16,w16,w19
-	eor	w25,w23,w21
-	ror	w27,w20,#27
-	add	w23,w23,w28		// future e+=K
-	eor	w16,w16,w8
-	eor	w25,w25,w22
-	add	w24,w24,w27		// e+=rot(a,5)
-	ror	w21,w21,#2
-	eor	w16,w16,w13
-	add	w23,w23,w15	// future e+=X[i]
-	add	w24,w24,w25		// e+=F(b,c,d)
-	ror	w16,w16,#31
-	eor	w17,w17,w3
-	eor	w25,w22,w20
-	ror	w27,w24,#27
-	add	w22,w22,w28		// future e+=K
-	eor	w17,w17,w9
-	eor	w25,w25,w21
-	add	w23,w23,w27		// e+=rot(a,5)
-	ror	w20,w20,#2
-	eor	w17,w17,w14
-	add	w22,w22,w16	// future e+=X[i]
-	add	w23,w23,w25		// e+=F(b,c,d)
-	ror	w17,w17,#31
-	eor	w19,w19,w4
-	eor	w25,w21,w24
-	ror	w27,w23,#27
-	add	w21,w21,w28		// future e+=K
-	eor	w19,w19,w10
-	eor	w25,w25,w20
-	add	w22,w22,w27		// e+=rot(a,5)
-	ror	w24,w24,#2
-	eor	w19,w19,w15
-	add	w21,w21,w17	// future e+=X[i]
-	add	w22,w22,w25		// e+=F(b,c,d)
-	ror	w19,w19,#31
-	ldp	w4,w5,[x0]
-	eor	w25,w20,w23
-	ror	w27,w22,#27
-	add	w20,w20,w28		// future e+=K
-	eor	w25,w25,w24
-	add	w21,w21,w27		// e+=rot(a,5)
-	ror	w23,w23,#2
-	add	w20,w20,w19	// future e+=X[i]
-	add	w21,w21,w25		// e+=F(b,c,d)
-	ldp	w6,w7,[x0,#8]
-	eor	w25,w24,w22
-	ror	w27,w21,#27
-	eor	w25,w25,w23
-	add	w20,w20,w27		// e+=rot(a,5)
-	ror	w22,w22,#2
-	ldr	w8,[x0,#16]
-	add	w20,w20,w25		// e+=F(b,c,d)
-	add	w21,w21,w5
-	add	w22,w22,w6
-	add	w20,w20,w4
-	add	w23,w23,w7
-	add	w24,w24,w8
-	stp	w20,w21,[x0]
-	stp	w22,w23,[x0,#8]
-	str	w24,[x0,#16]
-	cbnz	x2,Loop
-
-	ldp	x19,x20,[sp,#16]
-	ldp	x21,x22,[sp,#32]
-	ldp	x23,x24,[sp,#48]
-	ldp	x25,x26,[sp,#64]
-	ldp	x27,x28,[sp,#80]
-	ldr	x29,[sp],#96
-	ret
-
-
-.align	6
-sha1_block_armv8:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	AARCH64_VALID_CALL_TARGET
-Lv8_entry:
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	adrp	x4,Lconst@PAGE
-	add	x4,x4,Lconst@PAGEOFF
-	eor	v1.16b,v1.16b,v1.16b
-	ld1	{v0.4s},[x0],#16
-	ld1	{v1.s}[0],[x0]
-	sub	x0,x0,#16
-	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
-
-Loop_hw:
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	sub	x2,x2,#1
-	rev32	v4.16b,v4.16b
-	rev32	v5.16b,v5.16b
-
-	add	v20.4s,v16.4s,v4.4s
-	rev32	v6.16b,v6.16b
-	orr	v22.16b,v0.16b,v0.16b	// offload
-
-	add	v21.4s,v16.4s,v5.4s
-	rev32	v7.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b
-.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
-	add	v20.4s,v16.4s,v6.4s
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
-.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
-	add	v21.4s,v16.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
-.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
-	add	v20.4s,v16.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
-.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
-.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
-	add	v20.4s,v17.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v17.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v17.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v18.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
-.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
-	add	v21.4s,v18.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v18.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
-.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v7.4s
-.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
-.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
-.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
-	add	v20.4s,v19.4s,v4.4s
-.long	0x5e281885	//sha1su1 v5.16b,v4.16b
-.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v5.4s
-.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
-.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-	add	v20.4s,v19.4s,v6.4s
-.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-	add	v21.4s,v19.4s,v7.4s
-
-.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
-.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
-
-.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
-.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
-
-	add	v1.4s,v1.4s,v2.4s
-	add	v0.4s,v0.4s,v22.4s
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.4s},[x0],#16
-	st1	{v1.s}[0],[x0]
-
-	ldr	x29,[sp],#16
-	ret
-
-.section	__TEXT,__const
-.align	6
-Lconst:
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
-.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S
deleted file mode 100644
index b54bcf9..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha256-armv8-apple.S
+++ /dev/null
@@ -1,1204 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are
-//	indication of some compiler "pathology", most notably code
-//	generated with -mgeneral-regs-only is significantly faster
-//	and the gap is only 40-90%.
-
-#ifndef	__KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha256_block_data_order
-.private_extern	_sha256_block_data_order
-
-.align	6
-_sha256_block_data_order:
-	AARCH64_VALID_CALL_TARGET
-#ifndef	__KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA256
-	b.ne	Lv8_entry
-#endif
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*4
-
-	ldp	w20,w21,[x0]				// load context
-	ldp	w22,w23,[x0,#2*4]
-	ldp	w24,w25,[x0,#4*4]
-	add	x2,x1,x2,lsl#6	// end of input
-	ldp	w26,w27,[x0,#6*4]
-	adrp	x30,LK256@PAGE
-	add	x30,x30,LK256@PAGEOFF
-	stp	x0,x2,[x29,#96]
-
-Loop:
-	ldp	w3,w4,[x1],#2*4
-	ldr	w19,[x30],#4			// *K++
-	eor	w28,w21,w22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	w3,w3			// 0
-#endif
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w6,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w3			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w4,w4			// 1
-#endif
-	ldp	w5,w6,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w7,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w4			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w5,w5			// 2
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w8,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w5			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w6,w6			// 3
-#endif
-	ldp	w7,w8,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w9,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w6			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w7,w7			// 4
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w10,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w7			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w10,ror#11	// Sigma1(e)
-	ror	w10,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w10,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w8,w8			// 5
-#endif
-	ldp	w9,w10,[x1],#2*4
-	add	w23,w23,w17			// h+=Sigma0(a)
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w11,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w8			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w11,ror#11	// Sigma1(e)
-	ror	w11,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w11,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w9,w9			// 6
-#endif
-	add	w22,w22,w17			// h+=Sigma0(a)
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w12,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w9			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w12,ror#11	// Sigma1(e)
-	ror	w12,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w12,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w10,w10			// 7
-#endif
-	ldp	w11,w12,[x1],#2*4
-	add	w21,w21,w17			// h+=Sigma0(a)
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	eor	w13,w25,w25,ror#14
-	and	w17,w26,w25
-	bic	w28,w27,w25
-	add	w20,w20,w10			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w13,ror#11	// Sigma1(e)
-	ror	w13,w21,#2
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	eor	w17,w21,w21,ror#9
-	add	w20,w20,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w24,w24,w20			// d+=h
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w13,w17,ror#13	// Sigma0(a)
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w20,w20,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w11,w11			// 8
-#endif
-	add	w20,w20,w17			// h+=Sigma0(a)
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w14,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w11			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w14,ror#11	// Sigma1(e)
-	ror	w14,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w14,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w12,w12			// 9
-#endif
-	ldp	w13,w14,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w15,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w12			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w15,ror#11	// Sigma1(e)
-	ror	w15,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w15,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w13,w13			// 10
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w0,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w13			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w0,ror#11	// Sigma1(e)
-	ror	w0,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w0,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w14,w14			// 11
-#endif
-	ldp	w15,w0,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w6,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w14			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w15,w15			// 12
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w7,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w15			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w0,w0			// 13
-#endif
-	ldp	w1,w2,[x1]
-	add	w23,w23,w17			// h+=Sigma0(a)
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w8,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w0			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w1,w1			// 14
-#endif
-	ldr	w6,[sp,#12]
-	add	w22,w22,w17			// h+=Sigma0(a)
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w9,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w1			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w2,w2			// 15
-#endif
-	ldr	w7,[sp,#0]
-	add	w21,w21,w17			// h+=Sigma0(a)
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-Loop_16_xx:
-	ldr	w8,[sp,#4]
-	str	w11,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w10,w5,#7
-	and	w17,w25,w24
-	ror	w9,w2,#17
-	bic	w19,w26,w24
-	ror	w11,w20,#2
-	add	w27,w27,w3			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w10,w10,w5,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w11,w11,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w9,w9,w2,ror#19
-	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w11,w20,ror#22	// Sigma0(a)
-	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
-	add	w4,w4,w13
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w4,w4,w10
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w4,w4,w9
-	ldr	w9,[sp,#8]
-	str	w12,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w11,w6,#7
-	and	w17,w24,w23
-	ror	w10,w3,#17
-	bic	w28,w25,w23
-	ror	w12,w27,#2
-	add	w26,w26,w4			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w11,w11,w6,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w12,w12,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w10,w10,w3,ror#19
-	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w12,w27,ror#22	// Sigma0(a)
-	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
-	add	w5,w5,w14
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w5,w5,w11
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w5,w5,w10
-	ldr	w10,[sp,#12]
-	str	w13,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w12,w7,#7
-	and	w17,w23,w22
-	ror	w11,w4,#17
-	bic	w19,w24,w22
-	ror	w13,w26,#2
-	add	w25,w25,w5			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w12,w12,w7,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w13,w13,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w11,w11,w4,ror#19
-	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w13,w26,ror#22	// Sigma0(a)
-	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
-	add	w6,w6,w15
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w6,w6,w12
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w6,w6,w11
-	ldr	w11,[sp,#0]
-	str	w14,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w13,w8,#7
-	and	w17,w22,w21
-	ror	w12,w5,#17
-	bic	w28,w23,w21
-	ror	w14,w25,#2
-	add	w24,w24,w6			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w13,w13,w8,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w14,w14,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w12,w12,w5,ror#19
-	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w14,w25,ror#22	// Sigma0(a)
-	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
-	add	w7,w7,w0
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w7,w7,w13
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w7,w7,w12
-	ldr	w12,[sp,#4]
-	str	w15,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w14,w9,#7
-	and	w17,w21,w20
-	ror	w13,w6,#17
-	bic	w19,w22,w20
-	ror	w15,w24,#2
-	add	w23,w23,w7			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w14,w14,w9,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w15,w15,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w13,w13,w6,ror#19
-	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w15,w24,ror#22	// Sigma0(a)
-	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
-	add	w8,w8,w1
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w8,w8,w14
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w8,w8,w13
-	ldr	w13,[sp,#8]
-	str	w0,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w15,w10,#7
-	and	w17,w20,w27
-	ror	w14,w7,#17
-	bic	w28,w21,w27
-	ror	w0,w23,#2
-	add	w22,w22,w8			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w15,w15,w10,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w0,w0,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w14,w14,w7,ror#19
-	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w0,w23,ror#22	// Sigma0(a)
-	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
-	add	w9,w9,w2
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w9,w9,w15
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w9,w9,w14
-	ldr	w14,[sp,#12]
-	str	w1,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w0,w11,#7
-	and	w17,w27,w26
-	ror	w15,w8,#17
-	bic	w19,w20,w26
-	ror	w1,w22,#2
-	add	w21,w21,w9			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w0,w0,w11,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w1,w1,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w15,w15,w8,ror#19
-	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w1,w22,ror#22	// Sigma0(a)
-	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
-	add	w10,w10,w3
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w10,w10,w0
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w10,w10,w15
-	ldr	w15,[sp,#0]
-	str	w2,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w1,w12,#7
-	and	w17,w26,w25
-	ror	w0,w9,#17
-	bic	w28,w27,w25
-	ror	w2,w21,#2
-	add	w20,w20,w10			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w1,w1,w12,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w2,w2,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w0,w0,w9,ror#19
-	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w2,w21,ror#22	// Sigma0(a)
-	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
-	add	w11,w11,w4
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w11,w11,w1
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w11,w11,w0
-	ldr	w0,[sp,#4]
-	str	w3,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w2,w13,#7
-	and	w17,w25,w24
-	ror	w1,w10,#17
-	bic	w19,w26,w24
-	ror	w3,w20,#2
-	add	w27,w27,w11			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w2,w2,w13,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w3,w3,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w1,w1,w10,ror#19
-	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w3,w20,ror#22	// Sigma0(a)
-	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
-	add	w12,w12,w5
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w12,w12,w2
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w12,w12,w1
-	ldr	w1,[sp,#8]
-	str	w4,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w3,w14,#7
-	and	w17,w24,w23
-	ror	w2,w11,#17
-	bic	w28,w25,w23
-	ror	w4,w27,#2
-	add	w26,w26,w12			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w3,w3,w14,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w4,w4,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w2,w2,w11,ror#19
-	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w4,w27,ror#22	// Sigma0(a)
-	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
-	add	w13,w13,w6
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w13,w13,w3
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w13,w13,w2
-	ldr	w2,[sp,#12]
-	str	w5,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w4,w15,#7
-	and	w17,w23,w22
-	ror	w3,w12,#17
-	bic	w19,w24,w22
-	ror	w5,w26,#2
-	add	w25,w25,w13			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w4,w4,w15,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w5,w5,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w3,w3,w12,ror#19
-	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w5,w26,ror#22	// Sigma0(a)
-	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
-	add	w14,w14,w7
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w14,w14,w4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w14,w14,w3
-	ldr	w3,[sp,#0]
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w5,w0,#7
-	and	w17,w22,w21
-	ror	w4,w13,#17
-	bic	w28,w23,w21
-	ror	w6,w25,#2
-	add	w24,w24,w14			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w5,w5,w0,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w6,w6,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w4,w4,w13,ror#19
-	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w25,ror#22	// Sigma0(a)
-	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
-	add	w15,w15,w8
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w15,w15,w5
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w15,w15,w4
-	ldr	w4,[sp,#4]
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w6,w1,#7
-	and	w17,w21,w20
-	ror	w5,w14,#17
-	bic	w19,w22,w20
-	ror	w7,w24,#2
-	add	w23,w23,w15			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w6,w6,w1,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w7,w7,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w5,w5,w14,ror#19
-	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w24,ror#22	// Sigma0(a)
-	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
-	add	w0,w0,w9
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w0,w0,w6
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w0,w0,w5
-	ldr	w5,[sp,#8]
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w7,w2,#7
-	and	w17,w20,w27
-	ror	w6,w15,#17
-	bic	w28,w21,w27
-	ror	w8,w23,#2
-	add	w22,w22,w0			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w7,w7,w2,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w8,w8,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w6,w6,w15,ror#19
-	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w23,ror#22	// Sigma0(a)
-	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
-	add	w1,w1,w10
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w1,w1,w7
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w1,w1,w6
-	ldr	w6,[sp,#12]
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w8,w3,#7
-	and	w17,w27,w26
-	ror	w7,w0,#17
-	bic	w19,w20,w26
-	ror	w9,w22,#2
-	add	w21,w21,w1			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w8,w8,w3,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w9,w9,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w7,w7,w0,ror#19
-	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w22,ror#22	// Sigma0(a)
-	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
-	add	w2,w2,w11
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w2,w2,w8
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w2,w2,w7
-	ldr	w7,[sp,#0]
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-	cbnz	w19,Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#260		// rewind
-
-	ldp	w3,w4,[x0]
-	ldp	w5,w6,[x0,#2*4]
-	add	x1,x1,#14*4			// advance input pointer
-	ldp	w7,w8,[x0,#4*4]
-	add	w20,w20,w3
-	ldp	w9,w10,[x0,#6*4]
-	add	w21,w21,w4
-	add	w22,w22,w5
-	add	w23,w23,w6
-	stp	w20,w21,[x0]
-	add	w24,w24,w7
-	add	w25,w25,w8
-	stp	w22,w23,[x0,#2*4]
-	add	w26,w26,w9
-	add	w27,w27,w10
-	cmp	x1,x2
-	stp	w24,w25,[x0,#4*4]
-	stp	w26,w27,[x0,#6*4]
-	b.ne	Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*4
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.section	__TEXT,__const
-.align	6
-
-LK256:
-.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long	0	//terminator
-
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-.text
-#ifndef	__KERNEL__
-
-.align	6
-sha256_block_armv8:
-Lv8_entry:
-	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v0.4s,v1.4s},[x0]
-	adrp	x3,LK256@PAGE
-	add	x3,x3,LK256@PAGEOFF
-
-Loop_hw:
-	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
-	sub	x2,x2,#1
-	ld1	{v16.4s},[x3],#16
-	rev32	v4.16b,v4.16b
-	rev32	v5.16b,v5.16b
-	rev32	v6.16b,v6.16b
-	rev32	v7.16b,v7.16b
-	orr	v18.16b,v0.16b,v0.16b		// offload
-	orr	v19.16b,v1.16b,v1.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v6.4s
-.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v7.4s
-.long	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1	{v17.4s},[x3],#16
-	add	v16.4s,v16.4s,v4.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	ld1	{v16.4s},[x3],#16
-	add	v17.4s,v17.4s,v5.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	ld1	{v17.4s},[x3]
-	add	v16.4s,v16.4s,v6.4s
-	sub	x3,x3,#64*4-16	// rewind
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	add	v17.4s,v17.4s,v7.4s
-	orr	v2.16b,v0.16b,v0.16b
-.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	add	v0.4s,v0.4s,v18.4s
-	add	v1.4s,v1.4s,v19.4s
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.4s,v1.4s},[x0]
-
-	ldr	x29,[sp],#16
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S
deleted file mode 100644
index 10e8aaf..0000000
--- a/apple-aarch64/crypto/fipsmodule/sha512-armv8-apple.S
+++ /dev/null
@@ -1,1606 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-//
-// Permission to use under GPLv2 terms is granted.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are
-//	indication of some compiler "pathology", most notably code
-//	generated with -mgeneral-regs-only is significantly faster
-//	and the gap is only 40-90%.
-
-#ifndef	__KERNEL__
-# include <openssl/arm_arch.h>
-#endif
-
-.text
-
-
-.private_extern	_OPENSSL_armcap_P
-.globl	_sha512_block_data_order
-.private_extern	_sha512_block_data_order
-
-.align	6
-_sha512_block_data_order:
-	AARCH64_VALID_CALL_TARGET
-#ifndef	__KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
-#else
-	adrp	x16,_OPENSSL_armcap_P@PAGE
-#endif
-	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
-	tst	w16,#ARMV8_SHA512
-	b.ne	Lv8_entry
-#endif
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*8
-
-	ldp	x20,x21,[x0]				// load context
-	ldp	x22,x23,[x0,#2*8]
-	ldp	x24,x25,[x0,#4*8]
-	add	x2,x1,x2,lsl#7	// end of input
-	ldp	x26,x27,[x0,#6*8]
-	adrp	x30,LK512@PAGE
-	add	x30,x30,LK512@PAGEOFF
-	stp	x0,x2,[x29,#96]
-
-Loop:
-	ldp	x3,x4,[x1],#2*8
-	ldr	x19,[x30],#8			// *K++
-	eor	x28,x21,x22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	x3,x3			// 0
-#endif
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x6,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x3			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x4,x4			// 1
-#endif
-	ldp	x5,x6,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x7,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x4			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x5,x5			// 2
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x8,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x5			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x6,x6			// 3
-#endif
-	ldp	x7,x8,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x9,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x6			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x7,x7			// 4
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x10,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x7			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x10,ror#18	// Sigma1(e)
-	ror	x10,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x10,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x8,x8			// 5
-#endif
-	ldp	x9,x10,[x1],#2*8
-	add	x23,x23,x17			// h+=Sigma0(a)
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x11,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x8			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x11,ror#18	// Sigma1(e)
-	ror	x11,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x11,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x9,x9			// 6
-#endif
-	add	x22,x22,x17			// h+=Sigma0(a)
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x12,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x9			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x12,ror#18	// Sigma1(e)
-	ror	x12,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x12,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x10,x10			// 7
-#endif
-	ldp	x11,x12,[x1],#2*8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	eor	x13,x25,x25,ror#23
-	and	x17,x26,x25
-	bic	x28,x27,x25
-	add	x20,x20,x10			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x13,ror#18	// Sigma1(e)
-	ror	x13,x21,#28
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	eor	x17,x21,x21,ror#5
-	add	x20,x20,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x24,x24,x20			// d+=h
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x13,x17,ror#34	// Sigma0(a)
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x20,x20,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x11,x11			// 8
-#endif
-	add	x20,x20,x17			// h+=Sigma0(a)
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x14,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x11			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x14,ror#18	// Sigma1(e)
-	ror	x14,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x14,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x12,x12			// 9
-#endif
-	ldp	x13,x14,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x15,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x12			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x15,ror#18	// Sigma1(e)
-	ror	x15,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x15,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x13,x13			// 10
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x0,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x13			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x0,ror#18	// Sigma1(e)
-	ror	x0,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x0,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x14,x14			// 11
-#endif
-	ldp	x15,x0,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x6,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x14			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x15,x15			// 12
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x7,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x15			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x0,x0			// 13
-#endif
-	ldp	x1,x2,[x1]
-	add	x23,x23,x17			// h+=Sigma0(a)
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x8,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x0			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x1,x1			// 14
-#endif
-	ldr	x6,[sp,#24]
-	add	x22,x22,x17			// h+=Sigma0(a)
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x9,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x1			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x2,x2			// 15
-#endif
-	ldr	x7,[sp,#0]
-	add	x21,x21,x17			// h+=Sigma0(a)
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-Loop_16_xx:
-	ldr	x8,[sp,#8]
-	str	x11,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x10,x5,#1
-	and	x17,x25,x24
-	ror	x9,x2,#19
-	bic	x19,x26,x24
-	ror	x11,x20,#28
-	add	x27,x27,x3			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x10,x10,x5,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x11,x11,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x9,x9,x2,ror#61
-	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x11,x20,ror#39	// Sigma0(a)
-	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
-	add	x4,x4,x13
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x4,x4,x10
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x4,x4,x9
-	ldr	x9,[sp,#16]
-	str	x12,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x11,x6,#1
-	and	x17,x24,x23
-	ror	x10,x3,#19
-	bic	x28,x25,x23
-	ror	x12,x27,#28
-	add	x26,x26,x4			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x11,x11,x6,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x12,x12,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x10,x10,x3,ror#61
-	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x12,x27,ror#39	// Sigma0(a)
-	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
-	add	x5,x5,x14
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x5,x5,x11
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x5,x5,x10
-	ldr	x10,[sp,#24]
-	str	x13,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x12,x7,#1
-	and	x17,x23,x22
-	ror	x11,x4,#19
-	bic	x19,x24,x22
-	ror	x13,x26,#28
-	add	x25,x25,x5			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x12,x12,x7,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x13,x13,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x11,x11,x4,ror#61
-	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x13,x26,ror#39	// Sigma0(a)
-	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
-	add	x6,x6,x15
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x6,x6,x12
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x6,x6,x11
-	ldr	x11,[sp,#0]
-	str	x14,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x13,x8,#1
-	and	x17,x22,x21
-	ror	x12,x5,#19
-	bic	x28,x23,x21
-	ror	x14,x25,#28
-	add	x24,x24,x6			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x13,x13,x8,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x14,x14,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x12,x12,x5,ror#61
-	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x14,x25,ror#39	// Sigma0(a)
-	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
-	add	x7,x7,x0
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x7,x7,x13
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x7,x7,x12
-	ldr	x12,[sp,#8]
-	str	x15,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x14,x9,#1
-	and	x17,x21,x20
-	ror	x13,x6,#19
-	bic	x19,x22,x20
-	ror	x15,x24,#28
-	add	x23,x23,x7			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x14,x14,x9,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x15,x15,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x13,x13,x6,ror#61
-	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x15,x24,ror#39	// Sigma0(a)
-	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
-	add	x8,x8,x1
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x8,x8,x14
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x8,x8,x13
-	ldr	x13,[sp,#16]
-	str	x0,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x15,x10,#1
-	and	x17,x20,x27
-	ror	x14,x7,#19
-	bic	x28,x21,x27
-	ror	x0,x23,#28
-	add	x22,x22,x8			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x15,x15,x10,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x0,x0,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x14,x14,x7,ror#61
-	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x0,x23,ror#39	// Sigma0(a)
-	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
-	add	x9,x9,x2
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x9,x9,x15
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x9,x9,x14
-	ldr	x14,[sp,#24]
-	str	x1,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x0,x11,#1
-	and	x17,x27,x26
-	ror	x15,x8,#19
-	bic	x19,x20,x26
-	ror	x1,x22,#28
-	add	x21,x21,x9			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x0,x0,x11,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x1,x1,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x15,x15,x8,ror#61
-	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x1,x22,ror#39	// Sigma0(a)
-	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
-	add	x10,x10,x3
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x10,x10,x0
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x10,x10,x15
-	ldr	x15,[sp,#0]
-	str	x2,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x1,x12,#1
-	and	x17,x26,x25
-	ror	x0,x9,#19
-	bic	x28,x27,x25
-	ror	x2,x21,#28
-	add	x20,x20,x10			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x1,x1,x12,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x2,x2,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x0,x0,x9,ror#61
-	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x2,x21,ror#39	// Sigma0(a)
-	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
-	add	x11,x11,x4
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x11,x11,x1
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x11,x11,x0
-	ldr	x0,[sp,#8]
-	str	x3,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x2,x13,#1
-	and	x17,x25,x24
-	ror	x1,x10,#19
-	bic	x19,x26,x24
-	ror	x3,x20,#28
-	add	x27,x27,x11			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x2,x2,x13,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x3,x3,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x1,x1,x10,ror#61
-	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x3,x20,ror#39	// Sigma0(a)
-	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
-	add	x12,x12,x5
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x12,x12,x2
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x12,x12,x1
-	ldr	x1,[sp,#16]
-	str	x4,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x3,x14,#1
-	and	x17,x24,x23
-	ror	x2,x11,#19
-	bic	x28,x25,x23
-	ror	x4,x27,#28
-	add	x26,x26,x12			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x3,x3,x14,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x4,x4,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x2,x2,x11,ror#61
-	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x4,x27,ror#39	// Sigma0(a)
-	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
-	add	x13,x13,x6
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x13,x13,x3
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x13,x13,x2
-	ldr	x2,[sp,#24]
-	str	x5,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x4,x15,#1
-	and	x17,x23,x22
-	ror	x3,x12,#19
-	bic	x19,x24,x22
-	ror	x5,x26,#28
-	add	x25,x25,x13			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x4,x4,x15,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x5,x5,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x3,x3,x12,ror#61
-	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x5,x26,ror#39	// Sigma0(a)
-	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
-	add	x14,x14,x7
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x14,x14,x4
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x14,x14,x3
-	ldr	x3,[sp,#0]
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x5,x0,#1
-	and	x17,x22,x21
-	ror	x4,x13,#19
-	bic	x28,x23,x21
-	ror	x6,x25,#28
-	add	x24,x24,x14			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x5,x5,x0,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x6,x6,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x4,x4,x13,ror#61
-	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x25,ror#39	// Sigma0(a)
-	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
-	add	x15,x15,x8
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x15,x15,x5
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x15,x15,x4
-	ldr	x4,[sp,#8]
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x6,x1,#1
-	and	x17,x21,x20
-	ror	x5,x14,#19
-	bic	x19,x22,x20
-	ror	x7,x24,#28
-	add	x23,x23,x15			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x6,x6,x1,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x7,x7,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x5,x5,x14,ror#61
-	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x24,ror#39	// Sigma0(a)
-	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
-	add	x0,x0,x9
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x0,x0,x6
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x0,x0,x5
-	ldr	x5,[sp,#16]
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x7,x2,#1
-	and	x17,x20,x27
-	ror	x6,x15,#19
-	bic	x28,x21,x27
-	ror	x8,x23,#28
-	add	x22,x22,x0			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x7,x7,x2,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x8,x8,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x6,x6,x15,ror#61
-	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x23,ror#39	// Sigma0(a)
-	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
-	add	x1,x1,x10
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x1,x1,x7
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x1,x1,x6
-	ldr	x6,[sp,#24]
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x8,x3,#1
-	and	x17,x27,x26
-	ror	x7,x0,#19
-	bic	x19,x20,x26
-	ror	x9,x22,#28
-	add	x21,x21,x1			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x8,x8,x3,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x9,x9,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x7,x7,x0,ror#61
-	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x22,ror#39	// Sigma0(a)
-	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
-	add	x2,x2,x11
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x2,x2,x8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x2,x2,x7
-	ldr	x7,[sp,#0]
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-	cbnz	x19,Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#648		// rewind
-
-	ldp	x3,x4,[x0]
-	ldp	x5,x6,[x0,#2*8]
-	add	x1,x1,#14*8			// advance input pointer
-	ldp	x7,x8,[x0,#4*8]
-	add	x20,x20,x3
-	ldp	x9,x10,[x0,#6*8]
-	add	x21,x21,x4
-	add	x22,x22,x5
-	add	x23,x23,x6
-	stp	x20,x21,[x0]
-	add	x24,x24,x7
-	add	x25,x25,x8
-	stp	x22,x23,[x0,#2*8]
-	add	x26,x26,x9
-	add	x27,x27,x10
-	cmp	x1,x2
-	stp	x24,x25,[x0,#4*8]
-	stp	x26,x27,[x0,#6*8]
-	b.ne	Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*8
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.section	__TEXT,__const
-.align	6
-
-LK512:
-.quad	0x428a2f98d728ae22,0x7137449123ef65cd
-.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad	0x3956c25bf348b538,0x59f111f1b605d019
-.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad	0xd807aa98a3030242,0x12835b0145706fbe
-.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad	0x9bdc06a725c71235,0xc19bf174cf692694
-.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad	0x983e5152ee66dfab,0xa831c66d2db43210
-.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad	0x06ca6351e003826f,0x142929670a0e6e70
-.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad	0x81c2c92e47edaee6,0x92722c851482353b
-.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad	0xd192e819d6ef5218,0xd69906245565a910
-.quad	0xf40e35855771202a,0x106aa07032bbd1b8
-.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad	0x90befffa23631e28,0xa4506cebde82bde9
-.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad	0xca273eceea26619c,0xd186b8c721c0c207
-.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad	0x113f9804bef90dae,0x1b710b35131c471b
-.quad	0x28db77f523047d84,0x32caab7b40c72493
-.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.quad	0	// terminator
-
-.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-.text
-#ifndef	__KERNEL__
-
-.align	6
-sha512_block_armv8:
-Lv8_entry:
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
-	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
-
-	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
-	adrp	x3,LK512@PAGE
-	add	x3,x3,LK512@PAGEOFF
-
-	rev64	v16.16b,v16.16b
-	rev64	v17.16b,v17.16b
-	rev64	v18.16b,v18.16b
-	rev64	v19.16b,v19.16b
-	rev64	v20.16b,v20.16b
-	rev64	v21.16b,v21.16b
-	rev64	v22.16b,v22.16b
-	rev64	v23.16b,v23.16b
-	b	Loop_hw
-
-.align	4
-Loop_hw:
-	ld1	{v24.2d},[x3],#16
-	subs	x2,x2,#1
-	sub	x4,x1,#128
-	orr	v26.16b,v0.16b,v0.16b			// offload
-	orr	v27.16b,v1.16b,v1.16b
-	orr	v28.16b,v2.16b,v2.16b
-	orr	v29.16b,v3.16b,v3.16b
-	csel	x1,x1,x4,ne			// conditional rewind
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08230	//sha512su0 v16.16b,v17.16b
-	ext	v7.16b,v20.16b,v21.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08251	//sha512su0 v17.16b,v18.16b
-	ext	v7.16b,v21.16b,v22.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec08272	//sha512su0 v18.16b,v19.16b
-	ext	v7.16b,v22.16b,v23.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08293	//sha512su0 v19.16b,v20.16b
-	ext	v7.16b,v23.16b,v16.16b,#8
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
-	ext	v7.16b,v16.16b,v17.16b,#8
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
-	ext	v7.16b,v17.16b,v18.16b,#8
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v25.2d},[x3],#16
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
-	ext	v7.16b,v18.16b,v19.16b,#8
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v24.2d},[x3],#16
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xcec08217	//sha512su0 v23.16b,v16.16b
-	ext	v7.16b,v19.16b,v20.16b,#8
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v16.2d
-	ld1	{v16.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-	rev64	v16.16b,v16.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v17.2d
-	ld1	{v17.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-	rev64	v17.16b,v17.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v18.2d
-	ld1	{v18.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-	rev64	v18.16b,v18.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v19.2d
-	ld1	{v19.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v2.16b,v3.16b,#8
-	ext	v6.16b,v1.16b,v2.16b,#8
-	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
-	rev64	v19.16b,v19.16b
-	add	v4.2d,v1.2d,v3.2d		// "D + T1"
-.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v20.2d
-	ld1	{v20.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v4.16b,v2.16b,#8
-	ext	v6.16b,v0.16b,v4.16b,#8
-	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
-	rev64	v20.16b,v20.16b
-	add	v1.2d,v0.2d,v2.2d		// "D + T1"
-.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
-	ld1	{v24.2d},[x3],#16
-	add	v25.2d,v25.2d,v21.2d
-	ld1	{v21.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v1.16b,v4.16b,#8
-	ext	v6.16b,v3.16b,v1.16b,#8
-	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
-	rev64	v21.16b,v21.16b
-	add	v0.2d,v3.2d,v4.2d		// "D + T1"
-.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
-	ld1	{v25.2d},[x3],#16
-	add	v24.2d,v24.2d,v22.2d
-	ld1	{v22.16b},[x1],#16		// load next input
-	ext	v24.16b,v24.16b,v24.16b,#8
-	ext	v5.16b,v0.16b,v1.16b,#8
-	ext	v6.16b,v2.16b,v0.16b,#8
-	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
-.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
-	rev64	v22.16b,v22.16b
-	add	v3.2d,v2.2d,v1.2d		// "D + T1"
-.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
-	sub	x3,x3,#80*8	// rewind
-	add	v25.2d,v25.2d,v23.2d
-	ld1	{v23.16b},[x1],#16		// load next input
-	ext	v25.16b,v25.16b,v25.16b,#8
-	ext	v5.16b,v3.16b,v0.16b,#8
-	ext	v6.16b,v4.16b,v3.16b,#8
-	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
-.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
-	rev64	v23.16b,v23.16b
-	add	v2.2d,v4.2d,v0.2d		// "D + T1"
-.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
-	add	v0.2d,v0.2d,v26.2d			// accumulate
-	add	v1.2d,v1.2d,v27.2d
-	add	v2.2d,v2.2d,v28.2d
-	add	v3.2d,v3.2d,v29.2d
-
-	cbnz	x2,Loop_hw
-
-	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
-
-	ldr	x29,[sp],#16
-	ret
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S b/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S
deleted file mode 100644
index a108a96..0000000
--- a/apple-aarch64/crypto/fipsmodule/vpaes-armv8-apple.S
+++ /dev/null
@@ -1,1224 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.section	__TEXT,__const
-
-
-.align	7	// totally strategic alignment
-_vpaes_consts:
-Lk_mc_forward:	//	mc_forward
-.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad	0x080B0A0904070605, 0x000302010C0F0E0D
-.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad	0x000302010C0F0E0D, 0x080B0A0904070605
-Lk_mc_backward:	//	mc_backward
-.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad	0x020100030E0D0C0F, 0x0A09080B06050407
-.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad	0x0A09080B06050407, 0x020100030E0D0C0F
-Lk_sr:	//	sr
-.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad	0x030E09040F0A0500, 0x0B06010C07020D08
-.quad	0x0F060D040B020900, 0x070E050C030A0108
-.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
-
-//
-// "Hot" constants
-//
-Lk_inv:	//	inv, inva
-.quad	0x0E05060F0D080180, 0x040703090A0B0C02
-.quad	0x01040A060F0B0780, 0x030D0E0C02050809
-Lk_ipt:	//	input transform (lo, hi)
-.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-Lk_sbo:	//	sbou, sbot
-.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-Lk_sb1:	//	sb1u, sb1t
-.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-Lk_sb2:	//	sb2u, sb2t
-.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
-.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
-
-//
-//  Decryption stuff
-//
-Lk_dipt:	//	decryption input transform
-.quad	0x0F505B040B545F00, 0x154A411E114E451A
-.quad	0x86E383E660056500, 0x12771772F491F194
-Lk_dsbo:	//	decryption sbox final output
-.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-Lk_dsb9:	//	decryption sbox output *9*u, *9*t
-.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-Lk_dsbd:	//	decryption sbox output *D*u, *D*t
-.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-Lk_dsbb:	//	decryption sbox output *B*u, *B*t
-.quad	0xD022649296B44200, 0x602646F6B0F2D404
-.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-Lk_dsbe:	//	decryption sbox output *E*u, *E*t
-.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-//
-//  Key schedule constants
-//
-Lk_dksd:	//	decryption key schedule: invskew x*D
-.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-Lk_dksb:	//	decryption key schedule: invskew x*B
-.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
-.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-Lk_dks9:	//	decryption key schedule: invskew x*9
-.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-Lk_rcon:	//	rcon
-.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-Lk_opt:	//	output transform
-.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
-.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.align	2
-
-.align	6
-
-.text
-##
-##  _aes_preheat
-##
-##  Fills register %r10 -> .aes_consts (so you can -fPIC)
-##  and %xmm9-%xmm15 as specified below.
-##
-
-.align	4
-_vpaes_encrypt_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v17.16b, #0x0f
-	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
-	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
-	ret
-
-
-##
-##  _aes_encrypt_core
-##
-##  AES-encrypt %xmm0.
-##
-##  Inputs:
-##     %xmm0 = input
-##     %xmm9-%xmm15 as in _vpaes_preheat
-##    (%rdx) = scheduled keys
-##
-##  Output in %xmm0
-##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
-##  Preserves %xmm6 - %xmm8 so you get some local vectors
-##
-##
-
-.align	4
-_vpaes_encrypt_core:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-	adrp	x11, Lk_mc_forward@PAGE+16
-	add	x11, x11, Lk_mc_forward@PAGEOFF+16
-						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
-	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
-	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
-						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
-	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
-	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	b	Lenc_entry
-
-.align	4
-Lenc_loop:
-	// middle of middle round
-	add	x10, x11, #0x40
-	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
-	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
-	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
-	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
-	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
-	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
-	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
-	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
-	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
-	sub	w8, w8, #1			// nr--
-
-Lenc_entry:
-	// top of round
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
-	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
-	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
-	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
-	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
-	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
-	cbnz	w8, Lenc_loop
-
-	// middle of last round
-	add	x10, x11, #0x80
-						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
-						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
-	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
-	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
-	ret
-
-
-.globl	_vpaes_encrypt
-.private_extern	_vpaes_encrypt
-
-.align	4
-_vpaes_encrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v7.16b}, [x0]
-	bl	_vpaes_encrypt_preheat
-	bl	_vpaes_encrypt_core
-	st1	{v0.16b}, [x1]
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
-.align	4
-_vpaes_encrypt_2x:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-	adrp	x11, Lk_mc_forward@PAGE+16
-	add	x11, x11, Lk_mc_forward@PAGEOFF+16
-						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
-	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
-	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	and	v9.16b,  v15.16b,  v17.16b
-	ushr	v8.16b,  v15.16b,  #4
-	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
-	tbl	v9.16b,  {v20.16b}, v9.16b
-						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
-	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
-	tbl	v10.16b, {v21.16b}, v8.16b
-	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
-	eor	v8.16b,  v9.16b,   v16.16b
-	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
-	eor	v8.16b,  v8.16b,   v10.16b
-	b	Lenc_2x_entry
-
-.align	4
-Lenc_2x_loop:
-	// middle of middle round
-	add	x10, x11, #0x40
-	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
-	tbl	v12.16b, {v25.16b}, v10.16b
-	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
-	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
-	tbl	v8.16b,  {v24.16b}, v11.16b
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
-	tbl	v13.16b, {v27.16b}, v10.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	eor	v8.16b,  v8.16b,  v12.16b
-	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
-	tbl	v10.16b, {v26.16b}, v11.16b
-	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
-	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
-	tbl	v11.16b, {v8.16b}, v1.16b
-	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
-	eor	v10.16b, v10.16b, v13.16b
-	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
-	tbl	v8.16b,  {v8.16b}, v4.16b
-	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
-	eor	v11.16b, v11.16b, v10.16b
-	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
-	tbl	v12.16b, {v11.16b},v1.16b
-	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	eor	v8.16b,  v8.16b,  v11.16b
-	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
-	eor	v8.16b,  v8.16b,  v12.16b
-	sub	w8, w8, #1			// nr--
-
-Lenc_2x_entry:
-	// top of round
-	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
-	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	and	v9.16b,  v8.16b, v17.16b
-	ushr	v8.16b,  v8.16b, #4
-	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
-	tbl	v13.16b, {v19.16b},v9.16b
-	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	eor	v9.16b,  v9.16b,  v8.16b
-	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
-	tbl	v11.16b, {v18.16b},v8.16b
-	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
-	tbl	v12.16b, {v18.16b},v9.16b
-	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v11.16b, v11.16b, v13.16b
-	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
-	eor	v12.16b, v12.16b, v13.16b
-	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
-	tbl	v10.16b, {v18.16b},v11.16b
-	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
-	tbl	v11.16b, {v18.16b},v12.16b
-	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
-	eor	v10.16b, v10.16b, v9.16b
-	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
-	eor	v11.16b, v11.16b, v8.16b
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
-	cbnz	w8, Lenc_2x_loop
-
-	// middle of last round
-	add	x10, x11, #0x80
-						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
-						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
-	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	tbl	v12.16b, {v22.16b}, v10.16b
-	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
-	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
-	tbl	v8.16b,  {v23.16b}, v11.16b
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	eor	v8.16b,  v8.16b,  v12.16b
-	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
-	tbl	v1.16b,  {v8.16b},v1.16b
-	ret
-
-
-
-.align	4
-_vpaes_decrypt_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v17.16b, #0x0f
-	adrp	x11, Lk_dipt@PAGE
-	add	x11, x11, Lk_dipt@PAGEOFF
-	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
-	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
-	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
-	ret
-
-
-##
-##  Decryption core
-##
-##  Same API as encryption core.
-##
-
-.align	4
-_vpaes_decrypt_core:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-
-						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
-	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
-	eor	x11, x11, #0x30			// xor		$0x30,	%r11
-	adrp	x10, Lk_sr@PAGE
-	add	x10, x10, Lk_sr@PAGEOFF
-	and	x11, x11, #0x30			// and		$0x30,	%r11
-	add	x11, x11, x10
-	adrp	x10, Lk_mc_forward@PAGE+48
-	add	x10, x10, Lk_mc_forward@PAGEOFF+48
-
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
-	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
-						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
-	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	b	Ldec_entry
-
-.align	4
-Ldec_loop:
-//
-//  Inverse mix columns
-//
-						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
-						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
-	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
-	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
-	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
-						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
-
-	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
-
-	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
-
-	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
-	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
-	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
-	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	sub	w8, w8, #1			// sub		$1,%rax			# nr--
-
-Ldec_entry:
-	// top of round
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
-	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
-	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
-	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
-	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
-	cbnz	w8, Ldec_loop
-
-	// middle of last round
-						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
-	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
-	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
-	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
-	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
-	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
-	ret
-
-
-.globl	_vpaes_decrypt
-.private_extern	_vpaes_decrypt
-
-.align	4
-_vpaes_decrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	ld1	{v7.16b}, [x0]
-	bl	_vpaes_decrypt_preheat
-	bl	_vpaes_decrypt_core
-	st1	{v0.16b}, [x1]
-
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-// v14-v15 input, v0-v1 output
-
-.align	4
-_vpaes_decrypt_2x:
-	mov	x9, x2
-	ldr	w8, [x2,#240]			// pull rounds
-
-						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
-	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
-	eor	x11, x11, #0x30			// xor		$0x30,	%r11
-	adrp	x10, Lk_sr@PAGE
-	add	x10, x10, Lk_sr@PAGEOFF
-	and	x11, x11, #0x30			// and		$0x30,	%r11
-	add	x11, x11, x10
-	adrp	x10, Lk_mc_forward@PAGE+48
-	add	x10, x10, Lk_mc_forward@PAGEOFF+48
-
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
-	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-	and	v9.16b,  v15.16b, v17.16b
-	ushr	v8.16b,  v15.16b, #4
-	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-	tbl	v10.16b, {v20.16b},v9.16b
-	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
-						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
-	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	tbl	v8.16b,  {v21.16b},v8.16b
-	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
-	eor	v10.16b, v10.16b, v16.16b
-	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
-	eor	v8.16b,  v8.16b,  v10.16b
-	b	Ldec_2x_entry
-
-.align	4
-Ldec_2x_loop:
-//
-//  Inverse mix columns
-//
-						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
-						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
-	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
-	tbl	v12.16b, {v24.16b}, v10.16b
-	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
-	tbl	v9.16b,  {v25.16b}, v11.16b
-	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
-	eor	v8.16b,  v12.16b, v16.16b
-						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
-
-	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
-	tbl	v12.16b, {v26.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
-	tbl	v9.16b,  {v27.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
-
-	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
-	tbl	v12.16b, {v28.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
-	tbl	v9.16b,  {v29.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
-
-	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
-	tbl	v12.16b, {v30.16b}, v10.16b
-	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	tbl	v8.16b,  {v8.16b},v5.16b
-	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
-	tbl	v9.16b,  {v31.16b}, v11.16b
-	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	eor	v8.16b,  v8.16b,  v12.16b
-	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
-	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	eor	v8.16b,  v8.16b,  v9.16b
-	sub	w8, w8, #1			// sub		$1,%rax			# nr--
-
-Ldec_2x_entry:
-	// top of round
-	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
-	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	and	v9.16b,  v8.16b,  v17.16b
-	ushr	v8.16b,  v8.16b,  #4
-	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
-	tbl	v10.16b, {v19.16b},v9.16b
-	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
-	eor	v9.16b,	 v9.16b,  v8.16b
-	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
-	tbl	v11.16b, {v18.16b},v8.16b
-	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
-	tbl	v12.16b, {v18.16b},v9.16b
-	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	eor	v11.16b, v11.16b, v10.16b
-	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
-	eor	v12.16b, v12.16b, v10.16b
-	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
-	tbl	v10.16b, {v18.16b},v11.16b
-	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
-	tbl	v11.16b, {v18.16b},v12.16b
-	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
-	eor	v10.16b, v10.16b, v9.16b
-	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
-	eor	v11.16b, v11.16b, v8.16b
-	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
-	cbnz	w8, Ldec_2x_loop
-
-	// middle of last round
-						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
-	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	tbl	v12.16b, {v22.16b}, v10.16b
-						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
-	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
-	tbl	v9.16b,  {v23.16b}, v11.16b
-	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
-	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
-	eor	v12.16b, v12.16b, v16.16b
-	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
-	eor	v8.16b,  v9.16b,  v12.16b
-	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
-	tbl	v1.16b,  {v8.16b},v2.16b
-	ret
-
-########################################################
-##                                                    ##
-##                  AES key schedule                  ##
-##                                                    ##
-########################################################
-
-.align	4
-_vpaes_key_preheat:
-	adrp	x10, Lk_inv@PAGE
-	add	x10, x10, Lk_inv@PAGEOFF
-	movi	v16.16b, #0x5b			// Lk_s63
-	adrp	x11, Lk_sb1@PAGE
-	add	x11, x11, Lk_sb1@PAGEOFF
-	movi	v17.16b, #0x0f			// Lk_s0F
-	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
-	adrp	x10, Lk_dksd@PAGE
-	add	x10, x10, Lk_dksd@PAGEOFF
-	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
-	adrp	x11, Lk_mc_forward@PAGE
-	add	x11, x11, Lk_mc_forward@PAGEOFF
-	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
-	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
-	ld1	{v8.2d}, [x10]			// Lk_rcon
-	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
-	ret
-
-
-
-.align	4
-_vpaes_schedule_core:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29, x30, [sp,#-16]!
-	add	x29,sp,#0
-
-	bl	_vpaes_key_preheat		// load the tables
-
-	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
-
-	// input transform
-	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
-	bl	_vpaes_schedule_transform
-	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
-
-	adrp	x10, Lk_sr@PAGE		// lea	Lk_sr(%rip),%r10
-	add	x10, x10, Lk_sr@PAGEOFF
-
-	add	x8, x8, x10
-	cbnz	w3, Lschedule_am_decrypting
-
-	// encrypting, output zeroth round key after transform
-	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
-	b	Lschedule_go
-
-Lschedule_am_decrypting:
-	// decrypting, output zeroth round key after shiftrows
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
-	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
-	eor	x8, x8, #0x30			// xor	$0x30, %r8
-
-Lschedule_go:
-	cmp	w1, #192			// cmp	$192,	%esi
-	b.hi	Lschedule_256
-	b.eq	Lschedule_192
-	// 128: fall though
-
-##
-##  .schedule_128
-##
-##  128-bit specific part of key schedule.
-##
-##  This schedule is really simple, because all its parts
-##  are accomplished by the subroutines.
-##
-Lschedule_128:
-	mov	x0, #10			// mov	$10, %esi
-
-Loop_schedule_128:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle		// write output
-	b	Loop_schedule_128
-
-##
-##  .aes_schedule_192
-##
-##  192-bit specific part of key schedule.
-##
-##  The main body of this schedule is the same as the 128-bit
-##  schedule, but with more smearing.  The long, high side is
-##  stored in %xmm7 as before, and the short, low side is in
-##  the high bits of %xmm6.
-##
-##  This schedule is somewhat nastier, however, because each
-##  round produces 192 bits of key material, or 1.5 round keys.
-##  Therefore, on each cycle we do 2 rounds and produce 3 round
-##  keys.
-##
-.align	4
-Lschedule_192:
-	sub	x0, x0, #8
-	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
-	bl	_vpaes_schedule_transform	// input transform
-	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
-	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
-	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
-	mov	x0, #4			// mov	$4,	%esi
-
-Loop_schedule_192:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_round
-	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
-	bl	_vpaes_schedule_mangle		// save key n
-	bl	_vpaes_schedule_192_smear
-	bl	_vpaes_schedule_mangle		// save key n+1
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle		// save key n+2
-	bl	_vpaes_schedule_192_smear
-	b	Loop_schedule_192
-
-##
-##  .aes_schedule_256
-##
-##  256-bit specific part of key schedule.
-##
-##  The structure here is very similar to the 128-bit
-##  schedule, but with an additional "low side" in
-##  %xmm6.  The low side's rounds are the same as the
-##  high side's, except no rcon and no rotation.
-##
-.align	4
-Lschedule_256:
-	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
-	bl	_vpaes_schedule_transform	// input transform
-	mov	x0, #7			// mov	$7, %esi
-
-Loop_schedule_256:
-	sub	x0, x0, #1			// dec	%esi
-	bl	_vpaes_schedule_mangle		// output low result
-	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
-
-	// high round
-	bl	_vpaes_schedule_round
-	cbz	x0, Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle
-
-	// low round. swap xmm7 and xmm6
-	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
-	movi	v4.16b, #0
-	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
-	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
-	bl	_vpaes_schedule_low_round
-	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
-
-	b	Loop_schedule_256
-
-##
-##  .aes_schedule_mangle_last
-##
-##  Mangler for last round of key schedule
-##  Mangles %xmm0
-##    when encrypting, outputs out(%xmm0) ^ 63
-##    when decrypting, outputs unskew(%xmm0)
-##
-##  Always called right before return... jumps to cleanup and exits
-##
-.align	4
-Lschedule_mangle_last:
-	// schedule last round key from xmm0
-	adrp	x11, Lk_deskew@PAGE	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
-	add	x11, x11, Lk_deskew@PAGEOFF
-
-	cbnz	w3, Lschedule_mangle_last_dec
-
-	// encrypting
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
-	adrp	x11, Lk_opt@PAGE		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
-	add	x11, x11, Lk_opt@PAGEOFF
-	add	x2, x2, #32			// add	$32,	%rdx
-	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
-
-Lschedule_mangle_last_dec:
-	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
-	sub	x2, x2, #16			// add	$-16,	%rdx
-	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
-	bl	_vpaes_schedule_transform	// output transform
-	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
-
-	// cleanup
-	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
-	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
-	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
-	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
-	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
-	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
-	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
-	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
-	ldp	x29, x30, [sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-##
-##  .aes_schedule_192_smear
-##
-##  Smear the short, low side in the 192-bit key schedule.
-##
-##  Inputs:
-##    %xmm7: high side, b  a  x  y
-##    %xmm6:  low side, d  c  0  0
-##    %xmm13: 0
-##
-##  Outputs:
-##    %xmm6: b+c+d  b+c  0  0
-##    %xmm0: b+c+d  b+c  b  a
-##
-
-.align	4
-_vpaes_schedule_192_smear:
-	movi	v1.16b, #0
-	dup	v0.4s, v7.s[3]
-	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
-	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
-	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
-	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
-	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
-	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
-	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
-	ret
-
-
-##
-##  .aes_schedule_round
-##
-##  Runs one main round of the key schedule on %xmm0, %xmm7
-##
-##  Specifically, runs subbytes on the high dword of %xmm0
-##  then rotates it by one byte and xors into the low dword of
-##  %xmm7.
-##
-##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
-##  next rcon.
-##
-##  Smears the dwords of %xmm7 by xoring the low into the
-##  second low, result into third, result into highest.
-##
-##  Returns results in %xmm7 = %xmm0.
-##  Clobbers %xmm1-%xmm4, %r11.
-##
-
-.align	4
-_vpaes_schedule_round:
-	// extract rcon from xmm8
-	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
-	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
-	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
-	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
-
-	// rotate
-	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
-	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
-
-	// fall through...
-
-	// low round: same as high round, but no rotation and no rcon.
-_vpaes_schedule_low_round:
-	// smear xmm7
-	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
-	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
-	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
-
-	// subbytes
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
-	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
-	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
-	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
-	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
-	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
-	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
-	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
-	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
-	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
-	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
-	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
-	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
-	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
-	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
-
-	// add in smeared stuff
-	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
-	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
-	ret
-
-
-##
-##  .aes_schedule_transform
-##
-##  Linear-transform %xmm0 according to tables at (%r11)
-##
-##  Requires that %xmm9 = 0x0F0F... as in preheat
-##  Output in %xmm0
-##  Clobbers %xmm1, %xmm2
-##
-
-.align	4
-_vpaes_schedule_transform:
-	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
-	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
-						// vmovdqa	(%r11),	%xmm2 	# lo
-	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
-						// vmovdqa	16(%r11),	%xmm1 # hi
-	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
-	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
-	ret
-
-
-##
-##  .aes_schedule_mangle
-##
-##  Mangle xmm0 from (basis-transformed) standard version
-##  to our version.
-##
-##  On encrypt,
-##    xor with 0x63
-##    multiply by circulant 0,1,1,1
-##    apply shiftrows transform
-##
-##  On decrypt,
-##    xor with 0x63
-##    multiply by "inverse mixcolumns" circulant E,B,D,9
-##    deskew
-##    apply shiftrows transform
-##
-##
-##  Writes out to (%rdx), and increments or decrements it
-##  Keeps track of round number mod 4 in %r8
-##  Preserves xmm0
-##  Clobbers xmm1-xmm5
-##
-
-.align	4
-_vpaes_schedule_mangle:
-	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
-						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
-	cbnz	w3, Lschedule_mangle_dec
-
-	// encrypting
-	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
-	add	x2, x2, #16			// add	$16,	%rdx
-	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
-	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
-	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
-	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
-
-	b	Lschedule_mangle_both
-.align	4
-Lschedule_mangle_dec:
-	// inverse mix columns
-						// lea	.Lk_dksd(%rip),%r11
-	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
-	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
-
-						// vmovdqa	0x00(%r11),	%xmm2
-	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-						// vmovdqa	0x10(%r11),	%xmm3
-	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-
-						// vmovdqa	0x20(%r11),	%xmm2
-	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-						// vmovdqa	0x30(%r11),	%xmm3
-	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-
-						// vmovdqa	0x40(%r11),	%xmm2
-	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-						// vmovdqa	0x50(%r11),	%xmm3
-	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
-
-						// vmovdqa	0x60(%r11),	%xmm2
-	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
-	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
-						// vmovdqa	0x70(%r11),	%xmm4
-	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
-	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
-	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
-	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
-
-	sub	x2, x2, #16			// add	$-16,	%rdx
-
-Lschedule_mangle_both:
-	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	add	x8, x8, #48			// add	$-16,	%r8
-	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
-	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
-	ret
-
-
-.globl	_vpaes_set_encrypt_key
-.private_extern	_vpaes_set_encrypt_key
-
-.align	4
-_vpaes_set_encrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-
-	lsr	w9, w1, #5		// shr	$5,%eax
-	add	w9, w9, #5		// $5,%eax
-	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-
-	mov	w3, #0		// mov	$0,%ecx
-	mov	x8, #0x30		// mov	$0x30,%r8d
-	bl	_vpaes_schedule_core
-	eor	x0, x0, x0
-
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.globl	_vpaes_set_decrypt_key
-.private_extern	_vpaes_set_decrypt_key
-
-.align	4
-_vpaes_set_decrypt_key:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-
-	lsr	w9, w1, #5		// shr	$5,%eax
-	add	w9, w9, #5		// $5,%eax
-	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-	lsl	w9, w9, #4		// shl	$4,%eax
-	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
-	add	x2, x2, x9
-
-	mov	w3, #1		// mov	$1,%ecx
-	lsr	w8, w1, #1		// shr	$1,%r8d
-	and	x8, x8, #32		// and	$32,%r8d
-	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
-	bl	_vpaes_schedule_core
-
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_vpaes_cbc_encrypt
-.private_extern	_vpaes_cbc_encrypt
-
-.align	4
-_vpaes_cbc_encrypt:
-	AARCH64_SIGN_LINK_REGISTER
-	cbz	x2, Lcbc_abort
-	cmp	w5, #0			// check direction
-	b.eq	vpaes_cbc_decrypt
-
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-
-	mov	x17, x2		// reassign
-	mov	x2,  x3		// reassign
-
-	ld1	{v0.16b}, [x4]	// load ivec
-	bl	_vpaes_encrypt_preheat
-	b	Lcbc_enc_loop
-
-.align	4
-Lcbc_enc_loop:
-	ld1	{v7.16b}, [x0],#16	// load input
-	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
-	bl	_vpaes_encrypt_core
-	st1	{v0.16b}, [x1],#16	// save output
-	subs	x17, x17, #16
-	b.hi	Lcbc_enc_loop
-
-	st1	{v0.16b}, [x4]	// write ivec
-
-	ldp	x29,x30,[sp],#16
-Lcbc_abort:
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-
-.align	4
-vpaes_cbc_decrypt:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
-	// only from vpaes_cbc_encrypt which has already signed the return address.
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-	stp	d10,d11,[sp,#-16]!
-	stp	d12,d13,[sp,#-16]!
-	stp	d14,d15,[sp,#-16]!
-
-	mov	x17, x2		// reassign
-	mov	x2,  x3		// reassign
-	ld1	{v6.16b}, [x4]	// load ivec
-	bl	_vpaes_decrypt_preheat
-	tst	x17, #16
-	b.eq	Lcbc_dec_loop2x
-
-	ld1	{v7.16b}, [x0], #16	// load input
-	bl	_vpaes_decrypt_core
-	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
-	orr	v6.16b, v7.16b, v7.16b	// next ivec value
-	st1	{v0.16b}, [x1], #16
-	subs	x17, x17, #16
-	b.ls	Lcbc_dec_done
-
-.align	4
-Lcbc_dec_loop2x:
-	ld1	{v14.16b,v15.16b}, [x0], #32
-	bl	_vpaes_decrypt_2x
-	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
-	eor	v1.16b, v1.16b, v14.16b
-	orr	v6.16b, v15.16b, v15.16b
-	st1	{v0.16b,v1.16b}, [x1], #32
-	subs	x17, x17, #32
-	b.hi	Lcbc_dec_loop2x
-
-Lcbc_dec_done:
-	st1	{v6.16b}, [x4]
-
-	ldp	d14,d15,[sp],#16
-	ldp	d12,d13,[sp],#16
-	ldp	d10,d11,[sp],#16
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-.globl	_vpaes_ctr32_encrypt_blocks
-.private_extern	_vpaes_ctr32_encrypt_blocks
-
-.align	4
-_vpaes_ctr32_encrypt_blocks:
-	AARCH64_SIGN_LINK_REGISTER
-	stp	x29,x30,[sp,#-16]!
-	add	x29,sp,#0
-	stp	d8,d9,[sp,#-16]!	// ABI spec says so
-	stp	d10,d11,[sp,#-16]!
-	stp	d12,d13,[sp,#-16]!
-	stp	d14,d15,[sp,#-16]!
-
-	cbz	x2, Lctr32_done
-
-	// Note, unlike the other functions, x2 here is measured in blocks,
-	// not bytes.
-	mov	x17, x2
-	mov	x2,  x3
-
-	// Load the IV and counter portion.
-	ldr	w6, [x4, #12]
-	ld1	{v7.16b}, [x4]
-
-	bl	_vpaes_encrypt_preheat
-	tst	x17, #1
-	rev	w6, w6		// The counter is big-endian.
-	b.eq	Lctr32_prep_loop
-
-	// Handle one block so the remaining block count is even for
-	// _vpaes_encrypt_2x.
-	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
-	bl	_vpaes_encrypt_core
-	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
-	st1	{v0.16b}, [x1], #16
-	subs	x17, x17, #1
-	// Update the counter.
-	add	w6, w6, #1
-	rev	w7, w6
-	mov	v7.s[3], w7
-	b.ls	Lctr32_done
-
-Lctr32_prep_loop:
-	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
-	// uses v14 and v15.
-	mov	v15.16b, v7.16b
-	mov	v14.16b, v7.16b
-	add	w6, w6, #1
-	rev	w7, w6
-	mov	v15.s[3], w7
-
-Lctr32_loop:
-	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
-	bl	_vpaes_encrypt_2x
-	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
-	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
-	st1	{v0.16b,v1.16b}, [x1], #32
-	subs	x17, x17, #2
-	// Update the counter.
-	add	w7, w6, #1
-	add	w6, w6, #2
-	rev	w7, w7
-	mov	v14.s[3], w7
-	rev	w7, w6
-	mov	v15.s[3], w7
-	b.hi	Lctr32_loop
-
-Lctr32_done:
-	ldp	d14,d15,[sp],#16
-	ldp	d12,d13,[sp],#16
-	ldp	d10,d11,[sp],#16
-	ldp	d8,d9,[sp],#16
-	ldp	x29,x30,[sp],#16
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-aarch64/crypto/test/trampoline-armv8-apple.S b/apple-aarch64/crypto/test/trampoline-armv8-apple.S
deleted file mode 100644
index 99055e0..0000000
--- a/apple-aarch64/crypto/test/trampoline-armv8-apple.S
+++ /dev/null
@@ -1,750 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-
-// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
-// with |argv|, then saves the callee-saved registers into |state|. It returns
-// the result of |func|. The |unwind| argument is unused.
-// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
-//                              const uint64_t *argv, size_t argc,
-//                              uint64_t unwind);
-
-.globl	_abi_test_trampoline
-.private_extern	_abi_test_trampoline
-.align	4
-_abi_test_trampoline:
-Labi_test_trampoline_begin:
-	AARCH64_SIGN_LINK_REGISTER
-	// Stack layout (low to high addresses)
-	//   x29,x30 (16 bytes)
-	//    d8-d15 (64 bytes)
-	//   x19-x28 (80 bytes)
-	//    x1 (8 bytes)
-	//   padding (8 bytes)
-	stp	x29, x30, [sp, #-176]!
-	mov	x29, sp
-
-	// Saved callee-saved registers and |state|.
-	stp	d8, d9, [sp, #16]
-	stp	d10, d11, [sp, #32]
-	stp	d12, d13, [sp, #48]
-	stp	d14, d15, [sp, #64]
-	stp	x19, x20, [sp, #80]
-	stp	x21, x22, [sp, #96]
-	stp	x23, x24, [sp, #112]
-	stp	x25, x26, [sp, #128]
-	stp	x27, x28, [sp, #144]
-	str	x1, [sp, #160]
-
-	// Load registers from |state|, with the exception of x29. x29 is the
-	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
-	// mandate that x29 always point to a frame. iOS64 does so, which means
-	// we cannot fill x29 with entropy without violating ABI rules
-	// ourselves. x29 is tested separately below.
-	ldp	d8, d9, [x1], #16
-	ldp	d10, d11, [x1], #16
-	ldp	d12, d13, [x1], #16
-	ldp	d14, d15, [x1], #16
-	ldp	x19, x20, [x1], #16
-	ldp	x21, x22, [x1], #16
-	ldp	x23, x24, [x1], #16
-	ldp	x25, x26, [x1], #16
-	ldp	x27, x28, [x1], #16
-
-	// Move parameters into temporary registers.
-	mov	x9, x0
-	mov	x10, x2
-	mov	x11, x3
-
-	// Load parameters into registers.
-	cbz	x11, Largs_done
-	ldr	x0, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x1, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x2, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x3, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x4, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x5, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x6, [x10], #8
-	subs	x11, x11, #1
-	b.eq	Largs_done
-	ldr	x7, [x10], #8
-
-Largs_done:
-	blr	x9
-
-	// Reload |state| and store registers.
-	ldr	x1, [sp, #160]
-	stp	d8, d9, [x1], #16
-	stp	d10, d11, [x1], #16
-	stp	d12, d13, [x1], #16
-	stp	d14, d15, [x1], #16
-	stp	x19, x20, [x1], #16
-	stp	x21, x22, [x1], #16
-	stp	x23, x24, [x1], #16
-	stp	x25, x26, [x1], #16
-	stp	x27, x28, [x1], #16
-
-	// |func| is required to preserve x29, the frame pointer. We cannot load
-	// random values into x29 (see comment above), so compare it against the
-	// expected value and zero the field of |state| if corrupted.
-	mov	x9, sp
-	cmp	x29, x9
-	b.eq	Lx29_ok
-	str	xzr, [x1]
-
-Lx29_ok:
-	// Restore callee-saved registers.
-	ldp	d8, d9, [sp, #16]
-	ldp	d10, d11, [sp, #32]
-	ldp	d12, d13, [sp, #48]
-	ldp	d14, d15, [sp, #64]
-	ldp	x19, x20, [sp, #80]
-	ldp	x21, x22, [sp, #96]
-	ldp	x23, x24, [sp, #112]
-	ldp	x25, x26, [sp, #128]
-	ldp	x27, x28, [sp, #144]
-
-	ldp	x29, x30, [sp], #176
-	AARCH64_VALIDATE_LINK_REGISTER
-	ret
-
-
-.globl	_abi_test_clobber_x0
-.private_extern	_abi_test_clobber_x0
-.align	4
-_abi_test_clobber_x0:
-	AARCH64_VALID_CALL_TARGET
-	mov	x0, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x1
-.private_extern	_abi_test_clobber_x1
-.align	4
-_abi_test_clobber_x1:
-	AARCH64_VALID_CALL_TARGET
-	mov	x1, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x2
-.private_extern	_abi_test_clobber_x2
-.align	4
-_abi_test_clobber_x2:
-	AARCH64_VALID_CALL_TARGET
-	mov	x2, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x3
-.private_extern	_abi_test_clobber_x3
-.align	4
-_abi_test_clobber_x3:
-	AARCH64_VALID_CALL_TARGET
-	mov	x3, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x4
-.private_extern	_abi_test_clobber_x4
-.align	4
-_abi_test_clobber_x4:
-	AARCH64_VALID_CALL_TARGET
-	mov	x4, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x5
-.private_extern	_abi_test_clobber_x5
-.align	4
-_abi_test_clobber_x5:
-	AARCH64_VALID_CALL_TARGET
-	mov	x5, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x6
-.private_extern	_abi_test_clobber_x6
-.align	4
-_abi_test_clobber_x6:
-	AARCH64_VALID_CALL_TARGET
-	mov	x6, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x7
-.private_extern	_abi_test_clobber_x7
-.align	4
-_abi_test_clobber_x7:
-	AARCH64_VALID_CALL_TARGET
-	mov	x7, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x8
-.private_extern	_abi_test_clobber_x8
-.align	4
-_abi_test_clobber_x8:
-	AARCH64_VALID_CALL_TARGET
-	mov	x8, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x9
-.private_extern	_abi_test_clobber_x9
-.align	4
-_abi_test_clobber_x9:
-	AARCH64_VALID_CALL_TARGET
-	mov	x9, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x10
-.private_extern	_abi_test_clobber_x10
-.align	4
-_abi_test_clobber_x10:
-	AARCH64_VALID_CALL_TARGET
-	mov	x10, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x11
-.private_extern	_abi_test_clobber_x11
-.align	4
-_abi_test_clobber_x11:
-	AARCH64_VALID_CALL_TARGET
-	mov	x11, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x12
-.private_extern	_abi_test_clobber_x12
-.align	4
-_abi_test_clobber_x12:
-	AARCH64_VALID_CALL_TARGET
-	mov	x12, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x13
-.private_extern	_abi_test_clobber_x13
-.align	4
-_abi_test_clobber_x13:
-	AARCH64_VALID_CALL_TARGET
-	mov	x13, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x14
-.private_extern	_abi_test_clobber_x14
-.align	4
-_abi_test_clobber_x14:
-	AARCH64_VALID_CALL_TARGET
-	mov	x14, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x15
-.private_extern	_abi_test_clobber_x15
-.align	4
-_abi_test_clobber_x15:
-	AARCH64_VALID_CALL_TARGET
-	mov	x15, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x16
-.private_extern	_abi_test_clobber_x16
-.align	4
-_abi_test_clobber_x16:
-	AARCH64_VALID_CALL_TARGET
-	mov	x16, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x17
-.private_extern	_abi_test_clobber_x17
-.align	4
-_abi_test_clobber_x17:
-	AARCH64_VALID_CALL_TARGET
-	mov	x17, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x19
-.private_extern	_abi_test_clobber_x19
-.align	4
-_abi_test_clobber_x19:
-	AARCH64_VALID_CALL_TARGET
-	mov	x19, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x20
-.private_extern	_abi_test_clobber_x20
-.align	4
-_abi_test_clobber_x20:
-	AARCH64_VALID_CALL_TARGET
-	mov	x20, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x21
-.private_extern	_abi_test_clobber_x21
-.align	4
-_abi_test_clobber_x21:
-	AARCH64_VALID_CALL_TARGET
-	mov	x21, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x22
-.private_extern	_abi_test_clobber_x22
-.align	4
-_abi_test_clobber_x22:
-	AARCH64_VALID_CALL_TARGET
-	mov	x22, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x23
-.private_extern	_abi_test_clobber_x23
-.align	4
-_abi_test_clobber_x23:
-	AARCH64_VALID_CALL_TARGET
-	mov	x23, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x24
-.private_extern	_abi_test_clobber_x24
-.align	4
-_abi_test_clobber_x24:
-	AARCH64_VALID_CALL_TARGET
-	mov	x24, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x25
-.private_extern	_abi_test_clobber_x25
-.align	4
-_abi_test_clobber_x25:
-	AARCH64_VALID_CALL_TARGET
-	mov	x25, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x26
-.private_extern	_abi_test_clobber_x26
-.align	4
-_abi_test_clobber_x26:
-	AARCH64_VALID_CALL_TARGET
-	mov	x26, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x27
-.private_extern	_abi_test_clobber_x27
-.align	4
-_abi_test_clobber_x27:
-	AARCH64_VALID_CALL_TARGET
-	mov	x27, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x28
-.private_extern	_abi_test_clobber_x28
-.align	4
-_abi_test_clobber_x28:
-	AARCH64_VALID_CALL_TARGET
-	mov	x28, xzr
-	ret
-
-
-.globl	_abi_test_clobber_x29
-.private_extern	_abi_test_clobber_x29
-.align	4
-_abi_test_clobber_x29:
-	AARCH64_VALID_CALL_TARGET
-	mov	x29, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d0
-.private_extern	_abi_test_clobber_d0
-.align	4
-_abi_test_clobber_d0:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d0, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d1
-.private_extern	_abi_test_clobber_d1
-.align	4
-_abi_test_clobber_d1:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d1, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d2
-.private_extern	_abi_test_clobber_d2
-.align	4
-_abi_test_clobber_d2:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d2, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d3
-.private_extern	_abi_test_clobber_d3
-.align	4
-_abi_test_clobber_d3:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d3, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d4
-.private_extern	_abi_test_clobber_d4
-.align	4
-_abi_test_clobber_d4:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d4, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d5
-.private_extern	_abi_test_clobber_d5
-.align	4
-_abi_test_clobber_d5:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d5, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d6
-.private_extern	_abi_test_clobber_d6
-.align	4
-_abi_test_clobber_d6:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d6, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d7
-.private_extern	_abi_test_clobber_d7
-.align	4
-_abi_test_clobber_d7:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d7, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d8
-.private_extern	_abi_test_clobber_d8
-.align	4
-_abi_test_clobber_d8:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d8, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d9
-.private_extern	_abi_test_clobber_d9
-.align	4
-_abi_test_clobber_d9:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d9, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d10
-.private_extern	_abi_test_clobber_d10
-.align	4
-_abi_test_clobber_d10:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d10, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d11
-.private_extern	_abi_test_clobber_d11
-.align	4
-_abi_test_clobber_d11:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d11, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d12
-.private_extern	_abi_test_clobber_d12
-.align	4
-_abi_test_clobber_d12:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d12, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d13
-.private_extern	_abi_test_clobber_d13
-.align	4
-_abi_test_clobber_d13:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d13, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d14
-.private_extern	_abi_test_clobber_d14
-.align	4
-_abi_test_clobber_d14:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d14, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d15
-.private_extern	_abi_test_clobber_d15
-.align	4
-_abi_test_clobber_d15:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d15, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d16
-.private_extern	_abi_test_clobber_d16
-.align	4
-_abi_test_clobber_d16:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d16, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d17
-.private_extern	_abi_test_clobber_d17
-.align	4
-_abi_test_clobber_d17:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d17, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d18
-.private_extern	_abi_test_clobber_d18
-.align	4
-_abi_test_clobber_d18:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d18, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d19
-.private_extern	_abi_test_clobber_d19
-.align	4
-_abi_test_clobber_d19:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d19, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d20
-.private_extern	_abi_test_clobber_d20
-.align	4
-_abi_test_clobber_d20:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d20, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d21
-.private_extern	_abi_test_clobber_d21
-.align	4
-_abi_test_clobber_d21:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d21, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d22
-.private_extern	_abi_test_clobber_d22
-.align	4
-_abi_test_clobber_d22:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d22, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d23
-.private_extern	_abi_test_clobber_d23
-.align	4
-_abi_test_clobber_d23:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d23, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d24
-.private_extern	_abi_test_clobber_d24
-.align	4
-_abi_test_clobber_d24:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d24, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d25
-.private_extern	_abi_test_clobber_d25
-.align	4
-_abi_test_clobber_d25:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d25, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d26
-.private_extern	_abi_test_clobber_d26
-.align	4
-_abi_test_clobber_d26:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d26, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d27
-.private_extern	_abi_test_clobber_d27
-.align	4
-_abi_test_clobber_d27:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d27, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d28
-.private_extern	_abi_test_clobber_d28
-.align	4
-_abi_test_clobber_d28:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d28, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d29
-.private_extern	_abi_test_clobber_d29
-.align	4
-_abi_test_clobber_d29:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d29, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d30
-.private_extern	_abi_test_clobber_d30
-.align	4
-_abi_test_clobber_d30:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d30, xzr
-	ret
-
-
-.globl	_abi_test_clobber_d31
-.private_extern	_abi_test_clobber_d31
-.align	4
-_abi_test_clobber_d31:
-	AARCH64_VALID_CALL_TARGET
-	fmov	d31, xzr
-	ret
-
-
-.globl	_abi_test_clobber_v8_upper
-.private_extern	_abi_test_clobber_v8_upper
-.align	4
-_abi_test_clobber_v8_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v8.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v9_upper
-.private_extern	_abi_test_clobber_v9_upper
-.align	4
-_abi_test_clobber_v9_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v9.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v10_upper
-.private_extern	_abi_test_clobber_v10_upper
-.align	4
-_abi_test_clobber_v10_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v10.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v11_upper
-.private_extern	_abi_test_clobber_v11_upper
-.align	4
-_abi_test_clobber_v11_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v11.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v12_upper
-.private_extern	_abi_test_clobber_v12_upper
-.align	4
-_abi_test_clobber_v12_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v12.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v13_upper
-.private_extern	_abi_test_clobber_v13_upper
-.align	4
-_abi_test_clobber_v13_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v13.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v14_upper
-.private_extern	_abi_test_clobber_v14_upper
-.align	4
-_abi_test_clobber_v14_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v14.d[1], xzr
-	ret
-
-
-.globl	_abi_test_clobber_v15_upper
-.private_extern	_abi_test_clobber_v15_upper
-.align	4
-_abi_test_clobber_v15_upper:
-	AARCH64_VALID_CALL_TARGET
-	fmov	v15.d[1], xzr
-	ret
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/apple-arm/crypto/chacha/chacha-armv4-apple.S b/apple-arm/crypto/chacha/chacha-armv4-apple.S
deleted file mode 100644
index cf2644e..0000000
--- a/apple-arm/crypto/chacha/chacha-armv4-apple.S
+++ /dev/null
@@ -1,1490 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax	unified
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code	32
-#endif
-
-#if defined(__thumb2__) || defined(__clang__)
-#define ldrhsb	ldrbhs
-#endif
-
-.align	5
-Lsigma:
-.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
-Lone:
-.long	1,0,0,0
-#if __ARM_MAX_ARCH__>=7
-LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-LChaCha20_ctr32
-#else
-.word	-1
-#endif
-
-.globl	_ChaCha20_ctr32
-.private_extern	_ChaCha20_ctr32
-#ifdef __thumb2__
-.thumb_func	_ChaCha20_ctr32
-#endif
-.align	5
-_ChaCha20_ctr32:
-LChaCha20_ctr32:
-	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
-	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
-	sub	r14,pc,#16		@ _ChaCha20_ctr32
-#else
-	adr	r14,LChaCha20_ctr32
-#endif
-	cmp	r2,#0			@ len==0?
-#ifdef	__thumb2__
-	itt	eq
-#endif
-	addeq	sp,sp,#4*3
-	beq	Lno_data
-#if __ARM_MAX_ARCH__>=7
-	cmp	r2,#192			@ test len
-	bls	Lshort
-	ldr	r4,[r14,#-32]
-	ldr	r4,[r14,r4]
-# ifdef	__APPLE__
-	ldr	r4,[r4]
-# endif
-	tst	r4,#ARMV7_NEON
-	bne	LChaCha20_neon
-Lshort:
-#endif
-	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
-	sub	sp,sp,#4*(16)		@ off-load area
-	sub	r14,r14,#64		@ Lsigma
-	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
-	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
-	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
-	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
-	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
-	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
-	b	Loop_outer_enter
-
-.align	4
-Loop_outer:
-	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
-	str	r11,[sp,#4*(32+2)]	@ save len
-	str	r12,  [sp,#4*(32+1)]	@ save inp
-	str	r14,  [sp,#4*(32+0)]	@ save out
-Loop_outer_enter:
-	ldr	r11, [sp,#4*(15)]
-	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
-	ldr	r10, [sp,#4*(13)]
-	ldr	r14,[sp,#4*(14)]
-	str	r11, [sp,#4*(16+15)]
-	mov	r11,#10
-	b	Loop
-
-.align	4
-Loop:
-	subs	r11,r11,#1
-	add	r0,r0,r4
-	mov	r12,r12,ror#16
-	add	r1,r1,r5
-	mov	r10,r10,ror#16
-	eor	r12,r12,r0,ror#16
-	eor	r10,r10,r1,ror#16
-	add	r8,r8,r12
-	mov	r4,r4,ror#20
-	add	r9,r9,r10
-	mov	r5,r5,ror#20
-	eor	r4,r4,r8,ror#20
-	eor	r5,r5,r9,ror#20
-	add	r0,r0,r4
-	mov	r12,r12,ror#24
-	add	r1,r1,r5
-	mov	r10,r10,ror#24
-	eor	r12,r12,r0,ror#24
-	eor	r10,r10,r1,ror#24
-	add	r8,r8,r12
-	mov	r4,r4,ror#25
-	add	r9,r9,r10
-	mov	r5,r5,ror#25
-	str	r10,[sp,#4*(16+13)]
-	ldr	r10,[sp,#4*(16+15)]
-	eor	r4,r4,r8,ror#25
-	eor	r5,r5,r9,ror#25
-	str	r8,[sp,#4*(16+8)]
-	ldr	r8,[sp,#4*(16+10)]
-	add	r2,r2,r6
-	mov	r14,r14,ror#16
-	str	r9,[sp,#4*(16+9)]
-	ldr	r9,[sp,#4*(16+11)]
-	add	r3,r3,r7
-	mov	r10,r10,ror#16
-	eor	r14,r14,r2,ror#16
-	eor	r10,r10,r3,ror#16
-	add	r8,r8,r14
-	mov	r6,r6,ror#20
-	add	r9,r9,r10
-	mov	r7,r7,ror#20
-	eor	r6,r6,r8,ror#20
-	eor	r7,r7,r9,ror#20
-	add	r2,r2,r6
-	mov	r14,r14,ror#24
-	add	r3,r3,r7
-	mov	r10,r10,ror#24
-	eor	r14,r14,r2,ror#24
-	eor	r10,r10,r3,ror#24
-	add	r8,r8,r14
-	mov	r6,r6,ror#25
-	add	r9,r9,r10
-	mov	r7,r7,ror#25
-	eor	r6,r6,r8,ror#25
-	eor	r7,r7,r9,ror#25
-	add	r0,r0,r5
-	mov	r10,r10,ror#16
-	add	r1,r1,r6
-	mov	r12,r12,ror#16
-	eor	r10,r10,r0,ror#16
-	eor	r12,r12,r1,ror#16
-	add	r8,r8,r10
-	mov	r5,r5,ror#20
-	add	r9,r9,r12
-	mov	r6,r6,ror#20
-	eor	r5,r5,r8,ror#20
-	eor	r6,r6,r9,ror#20
-	add	r0,r0,r5
-	mov	r10,r10,ror#24
-	add	r1,r1,r6
-	mov	r12,r12,ror#24
-	eor	r10,r10,r0,ror#24
-	eor	r12,r12,r1,ror#24
-	add	r8,r8,r10
-	mov	r5,r5,ror#25
-	str	r10,[sp,#4*(16+15)]
-	ldr	r10,[sp,#4*(16+13)]
-	add	r9,r9,r12
-	mov	r6,r6,ror#25
-	eor	r5,r5,r8,ror#25
-	eor	r6,r6,r9,ror#25
-	str	r8,[sp,#4*(16+10)]
-	ldr	r8,[sp,#4*(16+8)]
-	add	r2,r2,r7
-	mov	r10,r10,ror#16
-	str	r9,[sp,#4*(16+11)]
-	ldr	r9,[sp,#4*(16+9)]
-	add	r3,r3,r4
-	mov	r14,r14,ror#16
-	eor	r10,r10,r2,ror#16
-	eor	r14,r14,r3,ror#16
-	add	r8,r8,r10
-	mov	r7,r7,ror#20
-	add	r9,r9,r14
-	mov	r4,r4,ror#20
-	eor	r7,r7,r8,ror#20
-	eor	r4,r4,r9,ror#20
-	add	r2,r2,r7
-	mov	r10,r10,ror#24
-	add	r3,r3,r4
-	mov	r14,r14,ror#24
-	eor	r10,r10,r2,ror#24
-	eor	r14,r14,r3,ror#24
-	add	r8,r8,r10
-	mov	r7,r7,ror#25
-	add	r9,r9,r14
-	mov	r4,r4,ror#25
-	eor	r7,r7,r8,ror#25
-	eor	r4,r4,r9,ror#25
-	bne	Loop
-
-	ldr	r11,[sp,#4*(32+2)]	@ load len
-
-	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
-	str	r9, [sp,#4*(16+9)]
-	str	r12,[sp,#4*(16+12)]
-	str	r10, [sp,#4*(16+13)]
-	str	r14,[sp,#4*(16+14)]
-
-	@ at this point we have first half of 512-bit result in
-	@ rx and second half at sp+4*(16+8)
-
-	cmp	r11,#64		@ done yet?
-#ifdef	__thumb2__
-	itete	lo
-#endif
-	addlo	r12,sp,#4*(0)		@ shortcut or ...
-	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
-	addlo	r14,sp,#4*(0)		@ shortcut or ...
-	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
-
-	ldr	r8,[sp,#4*(0)]	@ load key material
-	ldr	r9,[sp,#4*(1)]
-
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
-	orr	r10,r12,r14
-	tst	r10,#3		@ are input and output aligned?
-	ldr	r10,[sp,#4*(2)]
-	bne	Lunaligned
-	cmp	r11,#64		@ restore flags
-# else
-	ldr	r10,[sp,#4*(2)]
-# endif
-	ldr	r11,[sp,#4*(3)]
-
-	add	r0,r0,r8	@ accumulate key material
-	add	r1,r1,r9
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r8,[r12],#16		@ load input
-	ldrhs	r9,[r12,#-12]
-
-	add	r2,r2,r10
-	add	r3,r3,r11
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r10,[r12,#-8]
-	ldrhs	r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r0,r0,r8	@ xor with input
-	eorhs	r1,r1,r9
-	add	r8,sp,#4*(4)
-	str	r0,[r14],#16		@ store output
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r2,r2,r10
-	eorhs	r3,r3,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r1,[r14,#-12]
-	str	r2,[r14,#-8]
-	str	r3,[r14,#-4]
-
-	add	r4,r4,r8	@ accumulate key material
-	add	r5,r5,r9
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r8,[r12],#16		@ load input
-	ldrhs	r9,[r12,#-12]
-	add	r6,r6,r10
-	add	r7,r7,r11
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r10,[r12,#-8]
-	ldrhs	r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r4,r4,r8
-	eorhs	r5,r5,r9
-	add	r8,sp,#4*(8)
-	str	r4,[r14],#16		@ store output
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r6,r6,r10
-	eorhs	r7,r7,r11
-	str	r5,[r14,#-12]
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r6,[r14,#-8]
-	add	r0,sp,#4*(16+8)
-	str	r7,[r14,#-4]
-
-	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
-
-	add	r0,r0,r8	@ accumulate key material
-	add	r1,r1,r9
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r8,[r12],#16		@ load input
-	ldrhs	r9,[r12,#-12]
-# ifdef	__thumb2__
-	itt	hi
-# endif
-	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
-	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
-	add	r2,r2,r10
-	add	r3,r3,r11
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r10,[r12,#-8]
-	ldrhs	r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r0,r0,r8
-	eorhs	r1,r1,r9
-	add	r8,sp,#4*(12)
-	str	r0,[r14],#16		@ store output
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r2,r2,r10
-	eorhs	r3,r3,r11
-	str	r1,[r14,#-12]
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r2,[r14,#-8]
-	str	r3,[r14,#-4]
-
-	add	r4,r4,r8	@ accumulate key material
-	add	r5,r5,r9
-# ifdef	__thumb2__
-	itt	hi
-# endif
-	addhi	r8,r8,#1		@ next counter value
-	strhi	r8,[sp,#4*(12)]	@ save next counter value
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r8,[r12],#16		@ load input
-	ldrhs	r9,[r12,#-12]
-	add	r6,r6,r10
-	add	r7,r7,r11
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhs	r10,[r12,#-8]
-	ldrhs	r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r4,r4,r8
-	eorhs	r5,r5,r9
-# ifdef	__thumb2__
-	it	ne
-# endif
-	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	eorhs	r6,r6,r10
-	eorhs	r7,r7,r11
-	str	r4,[r14],#16		@ store output
-	str	r5,[r14,#-12]
-# ifdef	__thumb2__
-	it	hs
-# endif
-	subhs	r11,r8,#64		@ len-=64
-	str	r6,[r14,#-8]
-	str	r7,[r14,#-4]
-	bhi	Loop_outer
-
-	beq	Ldone
-# if __ARM_ARCH__<7
-	b	Ltail
-
-.align	4
-Lunaligned:@ unaligned endian-neutral path
-	cmp	r11,#64		@ restore flags
-# endif
-#endif
-#if __ARM_ARCH__<7
-	ldr	r11,[sp,#4*(3)]
-	add	r0,r0,r8		@ accumulate key material
-	add	r1,r1,r9
-	add	r2,r2,r10
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r8,r8,r8		@ zero or ...
-	ldrhsb	r8,[r12],#16			@ ... load input
-	eorlo	r9,r9,r9
-	ldrhsb	r9,[r12,#-12]
-
-	add	r3,r3,r11
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r10,r10,r10
-	ldrhsb	r10,[r12,#-8]
-	eorlo	r11,r11,r11
-	ldrhsb	r11,[r12,#-4]
-
-	eor	r0,r8,r0		@ xor with input (or zero)
-	eor	r1,r9,r1
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-15]		@ load more input
-	ldrhsb	r9,[r12,#-11]
-	eor	r2,r10,r2
-	strb	r0,[r14],#16		@ store output
-	eor	r3,r11,r3
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-7]
-	ldrhsb	r11,[r12,#-3]
-	strb	r1,[r14,#-12]
-	eor	r0,r8,r0,lsr#8
-	strb	r2,[r14,#-8]
-	eor	r1,r9,r1,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-14]		@ load more input
-	ldrhsb	r9,[r12,#-10]
-	strb	r3,[r14,#-4]
-	eor	r2,r10,r2,lsr#8
-	strb	r0,[r14,#-15]
-	eor	r3,r11,r3,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-6]
-	ldrhsb	r11,[r12,#-2]
-	strb	r1,[r14,#-11]
-	eor	r0,r8,r0,lsr#8
-	strb	r2,[r14,#-7]
-	eor	r1,r9,r1,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-13]		@ load more input
-	ldrhsb	r9,[r12,#-9]
-	strb	r3,[r14,#-3]
-	eor	r2,r10,r2,lsr#8
-	strb	r0,[r14,#-14]
-	eor	r3,r11,r3,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-5]
-	ldrhsb	r11,[r12,#-1]
-	strb	r1,[r14,#-10]
-	strb	r2,[r14,#-6]
-	eor	r0,r8,r0,lsr#8
-	strb	r3,[r14,#-2]
-	eor	r1,r9,r1,lsr#8
-	strb	r0,[r14,#-13]
-	eor	r2,r10,r2,lsr#8
-	strb	r1,[r14,#-9]
-	eor	r3,r11,r3,lsr#8
-	strb	r2,[r14,#-5]
-	strb	r3,[r14,#-1]
-	add	r8,sp,#4*(4+0)
-	ldmia	r8,{r8,r9,r10,r11}		@ load key material
-	add	r0,sp,#4*(16+8)
-	add	r4,r4,r8		@ accumulate key material
-	add	r5,r5,r9
-	add	r6,r6,r10
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r8,r8,r8		@ zero or ...
-	ldrhsb	r8,[r12],#16			@ ... load input
-	eorlo	r9,r9,r9
-	ldrhsb	r9,[r12,#-12]
-
-	add	r7,r7,r11
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r10,r10,r10
-	ldrhsb	r10,[r12,#-8]
-	eorlo	r11,r11,r11
-	ldrhsb	r11,[r12,#-4]
-
-	eor	r4,r8,r4		@ xor with input (or zero)
-	eor	r5,r9,r5
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-15]		@ load more input
-	ldrhsb	r9,[r12,#-11]
-	eor	r6,r10,r6
-	strb	r4,[r14],#16		@ store output
-	eor	r7,r11,r7
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-7]
-	ldrhsb	r11,[r12,#-3]
-	strb	r5,[r14,#-12]
-	eor	r4,r8,r4,lsr#8
-	strb	r6,[r14,#-8]
-	eor	r5,r9,r5,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-14]		@ load more input
-	ldrhsb	r9,[r12,#-10]
-	strb	r7,[r14,#-4]
-	eor	r6,r10,r6,lsr#8
-	strb	r4,[r14,#-15]
-	eor	r7,r11,r7,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-6]
-	ldrhsb	r11,[r12,#-2]
-	strb	r5,[r14,#-11]
-	eor	r4,r8,r4,lsr#8
-	strb	r6,[r14,#-7]
-	eor	r5,r9,r5,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-13]		@ load more input
-	ldrhsb	r9,[r12,#-9]
-	strb	r7,[r14,#-3]
-	eor	r6,r10,r6,lsr#8
-	strb	r4,[r14,#-14]
-	eor	r7,r11,r7,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-5]
-	ldrhsb	r11,[r12,#-1]
-	strb	r5,[r14,#-10]
-	strb	r6,[r14,#-6]
-	eor	r4,r8,r4,lsr#8
-	strb	r7,[r14,#-2]
-	eor	r5,r9,r5,lsr#8
-	strb	r4,[r14,#-13]
-	eor	r6,r10,r6,lsr#8
-	strb	r5,[r14,#-9]
-	eor	r7,r11,r7,lsr#8
-	strb	r6,[r14,#-5]
-	strb	r7,[r14,#-1]
-	add	r8,sp,#4*(4+4)
-	ldmia	r8,{r8,r9,r10,r11}		@ load key material
-	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
-# ifdef	__thumb2__
-	itt	hi
-# endif
-	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
-	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
-	add	r0,r0,r8		@ accumulate key material
-	add	r1,r1,r9
-	add	r2,r2,r10
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r8,r8,r8		@ zero or ...
-	ldrhsb	r8,[r12],#16			@ ... load input
-	eorlo	r9,r9,r9
-	ldrhsb	r9,[r12,#-12]
-
-	add	r3,r3,r11
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r10,r10,r10
-	ldrhsb	r10,[r12,#-8]
-	eorlo	r11,r11,r11
-	ldrhsb	r11,[r12,#-4]
-
-	eor	r0,r8,r0		@ xor with input (or zero)
-	eor	r1,r9,r1
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-15]		@ load more input
-	ldrhsb	r9,[r12,#-11]
-	eor	r2,r10,r2
-	strb	r0,[r14],#16		@ store output
-	eor	r3,r11,r3
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-7]
-	ldrhsb	r11,[r12,#-3]
-	strb	r1,[r14,#-12]
-	eor	r0,r8,r0,lsr#8
-	strb	r2,[r14,#-8]
-	eor	r1,r9,r1,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-14]		@ load more input
-	ldrhsb	r9,[r12,#-10]
-	strb	r3,[r14,#-4]
-	eor	r2,r10,r2,lsr#8
-	strb	r0,[r14,#-15]
-	eor	r3,r11,r3,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-6]
-	ldrhsb	r11,[r12,#-2]
-	strb	r1,[r14,#-11]
-	eor	r0,r8,r0,lsr#8
-	strb	r2,[r14,#-7]
-	eor	r1,r9,r1,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-13]		@ load more input
-	ldrhsb	r9,[r12,#-9]
-	strb	r3,[r14,#-3]
-	eor	r2,r10,r2,lsr#8
-	strb	r0,[r14,#-14]
-	eor	r3,r11,r3,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-5]
-	ldrhsb	r11,[r12,#-1]
-	strb	r1,[r14,#-10]
-	strb	r2,[r14,#-6]
-	eor	r0,r8,r0,lsr#8
-	strb	r3,[r14,#-2]
-	eor	r1,r9,r1,lsr#8
-	strb	r0,[r14,#-13]
-	eor	r2,r10,r2,lsr#8
-	strb	r1,[r14,#-9]
-	eor	r3,r11,r3,lsr#8
-	strb	r2,[r14,#-5]
-	strb	r3,[r14,#-1]
-	add	r8,sp,#4*(4+8)
-	ldmia	r8,{r8,r9,r10,r11}		@ load key material
-	add	r4,r4,r8		@ accumulate key material
-# ifdef	__thumb2__
-	itt	hi
-# endif
-	addhi	r8,r8,#1			@ next counter value
-	strhi	r8,[sp,#4*(12)]		@ save next counter value
-	add	r5,r5,r9
-	add	r6,r6,r10
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r8,r8,r8		@ zero or ...
-	ldrhsb	r8,[r12],#16			@ ... load input
-	eorlo	r9,r9,r9
-	ldrhsb	r9,[r12,#-12]
-
-	add	r7,r7,r11
-# ifdef	__thumb2__
-	itete	lo
-# endif
-	eorlo	r10,r10,r10
-	ldrhsb	r10,[r12,#-8]
-	eorlo	r11,r11,r11
-	ldrhsb	r11,[r12,#-4]
-
-	eor	r4,r8,r4		@ xor with input (or zero)
-	eor	r5,r9,r5
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-15]		@ load more input
-	ldrhsb	r9,[r12,#-11]
-	eor	r6,r10,r6
-	strb	r4,[r14],#16		@ store output
-	eor	r7,r11,r7
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-7]
-	ldrhsb	r11,[r12,#-3]
-	strb	r5,[r14,#-12]
-	eor	r4,r8,r4,lsr#8
-	strb	r6,[r14,#-8]
-	eor	r5,r9,r5,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-14]		@ load more input
-	ldrhsb	r9,[r12,#-10]
-	strb	r7,[r14,#-4]
-	eor	r6,r10,r6,lsr#8
-	strb	r4,[r14,#-15]
-	eor	r7,r11,r7,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-6]
-	ldrhsb	r11,[r12,#-2]
-	strb	r5,[r14,#-11]
-	eor	r4,r8,r4,lsr#8
-	strb	r6,[r14,#-7]
-	eor	r5,r9,r5,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r8,[r12,#-13]		@ load more input
-	ldrhsb	r9,[r12,#-9]
-	strb	r7,[r14,#-3]
-	eor	r6,r10,r6,lsr#8
-	strb	r4,[r14,#-14]
-	eor	r7,r11,r7,lsr#8
-# ifdef	__thumb2__
-	itt	hs
-# endif
-	ldrhsb	r10,[r12,#-5]
-	ldrhsb	r11,[r12,#-1]
-	strb	r5,[r14,#-10]
-	strb	r6,[r14,#-6]
-	eor	r4,r8,r4,lsr#8
-	strb	r7,[r14,#-2]
-	eor	r5,r9,r5,lsr#8
-	strb	r4,[r14,#-13]
-	eor	r6,r10,r6,lsr#8
-	strb	r5,[r14,#-9]
-	eor	r7,r11,r7,lsr#8
-	strb	r6,[r14,#-5]
-	strb	r7,[r14,#-1]
-# ifdef	__thumb2__
-	it	ne
-# endif
-	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
-# ifdef	__thumb2__
-	it	hs
-# endif
-	subhs	r11,r8,#64			@ len-=64
-	bhi	Loop_outer
-
-	beq	Ldone
-#endif
-
-Ltail:
-	ldr	r12,[sp,#4*(32+1)]	@ load inp
-	add	r9,sp,#4*(0)
-	ldr	r14,[sp,#4*(32+0)]	@ load out
-
-Loop_tail:
-	ldrb	r10,[r9],#1	@ read buffer on stack
-	ldrb	r11,[r12],#1		@ read input
-	subs	r8,r8,#1
-	eor	r11,r11,r10
-	strb	r11,[r14],#1		@ store output
-	bne	Loop_tail
-
-Ldone:
-	add	sp,sp,#4*(32+3)
-Lno_data:
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func	ChaCha20_neon
-#endif
-.align	5
-ChaCha20_neon:
-	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
-	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
-LChaCha20_neon:
-	adr	r14,Lsigma
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
-	stmdb	sp!,{r0,r1,r2,r3}
-
-	vld1.32	{q1,q2},[r3]		@ load key
-	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
-
-	sub	sp,sp,#4*(16+16)
-	vld1.32	{q3},[r12]		@ load counter and nonce
-	add	r12,sp,#4*8
-	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
-	vld1.32	{q0},[r14]!		@ load sigma
-	vld1.32	{q12},[r14]		@ one
-	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
-	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
-
-	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
-	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
-	vshl.i32	d26,d24,#1	@ two
-	vstr	d24,[sp,#4*(16+0)]
-	vshl.i32	d28,d24,#2	@ four
-	vstr	d26,[sp,#4*(16+2)]
-	vmov	q4,q0
-	vstr	d28,[sp,#4*(16+4)]
-	vmov	q8,q0
-	vmov	q5,q1
-	vmov	q9,q1
-	b	Loop_neon_enter
-
-.align	4
-Loop_neon_outer:
-	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
-	cmp	r11,#64*2		@ if len<=64*2
-	bls	Lbreak_neon		@ switch to integer-only
-	vmov	q4,q0
-	str	r11,[sp,#4*(32+2)]	@ save len
-	vmov	q8,q0
-	str	r12,  [sp,#4*(32+1)]	@ save inp
-	vmov	q5,q1
-	str	r14,  [sp,#4*(32+0)]	@ save out
-	vmov	q9,q1
-Loop_neon_enter:
-	ldr	r11, [sp,#4*(15)]
-	vadd.i32	q7,q3,q12		@ counter+1
-	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
-	vmov	q6,q2
-	ldr	r10, [sp,#4*(13)]
-	vmov	q10,q2
-	ldr	r14,[sp,#4*(14)]
-	vadd.i32	q11,q7,q12		@ counter+2
-	str	r11, [sp,#4*(16+15)]
-	mov	r11,#10
-	add	r12,r12,#3	@ counter+3
-	b	Loop_neon
-
-.align	4
-Loop_neon:
-	subs	r11,r11,#1
-	vadd.i32	q0,q0,q1
-	add	r0,r0,r4
-	vadd.i32	q4,q4,q5
-	mov	r12,r12,ror#16
-	vadd.i32	q8,q8,q9
-	add	r1,r1,r5
-	veor	q3,q3,q0
-	mov	r10,r10,ror#16
-	veor	q7,q7,q4
-	eor	r12,r12,r0,ror#16
-	veor	q11,q11,q8
-	eor	r10,r10,r1,ror#16
-	vrev32.16	q3,q3
-	add	r8,r8,r12
-	vrev32.16	q7,q7
-	mov	r4,r4,ror#20
-	vrev32.16	q11,q11
-	add	r9,r9,r10
-	vadd.i32	q2,q2,q3
-	mov	r5,r5,ror#20
-	vadd.i32	q6,q6,q7
-	eor	r4,r4,r8,ror#20
-	vadd.i32	q10,q10,q11
-	eor	r5,r5,r9,ror#20
-	veor	q12,q1,q2
-	add	r0,r0,r4
-	veor	q13,q5,q6
-	mov	r12,r12,ror#24
-	veor	q14,q9,q10
-	add	r1,r1,r5
-	vshr.u32	q1,q12,#20
-	mov	r10,r10,ror#24
-	vshr.u32	q5,q13,#20
-	eor	r12,r12,r0,ror#24
-	vshr.u32	q9,q14,#20
-	eor	r10,r10,r1,ror#24
-	vsli.32	q1,q12,#12
-	add	r8,r8,r12
-	vsli.32	q5,q13,#12
-	mov	r4,r4,ror#25
-	vsli.32	q9,q14,#12
-	add	r9,r9,r10
-	vadd.i32	q0,q0,q1
-	mov	r5,r5,ror#25
-	vadd.i32	q4,q4,q5
-	str	r10,[sp,#4*(16+13)]
-	vadd.i32	q8,q8,q9
-	ldr	r10,[sp,#4*(16+15)]
-	veor	q12,q3,q0
-	eor	r4,r4,r8,ror#25
-	veor	q13,q7,q4
-	eor	r5,r5,r9,ror#25
-	veor	q14,q11,q8
-	str	r8,[sp,#4*(16+8)]
-	vshr.u32	q3,q12,#24
-	ldr	r8,[sp,#4*(16+10)]
-	vshr.u32	q7,q13,#24
-	add	r2,r2,r6
-	vshr.u32	q11,q14,#24
-	mov	r14,r14,ror#16
-	vsli.32	q3,q12,#8
-	str	r9,[sp,#4*(16+9)]
-	vsli.32	q7,q13,#8
-	ldr	r9,[sp,#4*(16+11)]
-	vsli.32	q11,q14,#8
-	add	r3,r3,r7
-	vadd.i32	q2,q2,q3
-	mov	r10,r10,ror#16
-	vadd.i32	q6,q6,q7
-	eor	r14,r14,r2,ror#16
-	vadd.i32	q10,q10,q11
-	eor	r10,r10,r3,ror#16
-	veor	q12,q1,q2
-	add	r8,r8,r14
-	veor	q13,q5,q6
-	mov	r6,r6,ror#20
-	veor	q14,q9,q10
-	add	r9,r9,r10
-	vshr.u32	q1,q12,#25
-	mov	r7,r7,ror#20
-	vshr.u32	q5,q13,#25
-	eor	r6,r6,r8,ror#20
-	vshr.u32	q9,q14,#25
-	eor	r7,r7,r9,ror#20
-	vsli.32	q1,q12,#7
-	add	r2,r2,r6
-	vsli.32	q5,q13,#7
-	mov	r14,r14,ror#24
-	vsli.32	q9,q14,#7
-	add	r3,r3,r7
-	vext.8	q2,q2,q2,#8
-	mov	r10,r10,ror#24
-	vext.8	q6,q6,q6,#8
-	eor	r14,r14,r2,ror#24
-	vext.8	q10,q10,q10,#8
-	eor	r10,r10,r3,ror#24
-	vext.8	q1,q1,q1,#4
-	add	r8,r8,r14
-	vext.8	q5,q5,q5,#4
-	mov	r6,r6,ror#25
-	vext.8	q9,q9,q9,#4
-	add	r9,r9,r10
-	vext.8	q3,q3,q3,#12
-	mov	r7,r7,ror#25
-	vext.8	q7,q7,q7,#12
-	eor	r6,r6,r8,ror#25
-	vext.8	q11,q11,q11,#12
-	eor	r7,r7,r9,ror#25
-	vadd.i32	q0,q0,q1
-	add	r0,r0,r5
-	vadd.i32	q4,q4,q5
-	mov	r10,r10,ror#16
-	vadd.i32	q8,q8,q9
-	add	r1,r1,r6
-	veor	q3,q3,q0
-	mov	r12,r12,ror#16
-	veor	q7,q7,q4
-	eor	r10,r10,r0,ror#16
-	veor	q11,q11,q8
-	eor	r12,r12,r1,ror#16
-	vrev32.16	q3,q3
-	add	r8,r8,r10
-	vrev32.16	q7,q7
-	mov	r5,r5,ror#20
-	vrev32.16	q11,q11
-	add	r9,r9,r12
-	vadd.i32	q2,q2,q3
-	mov	r6,r6,ror#20
-	vadd.i32	q6,q6,q7
-	eor	r5,r5,r8,ror#20
-	vadd.i32	q10,q10,q11
-	eor	r6,r6,r9,ror#20
-	veor	q12,q1,q2
-	add	r0,r0,r5
-	veor	q13,q5,q6
-	mov	r10,r10,ror#24
-	veor	q14,q9,q10
-	add	r1,r1,r6
-	vshr.u32	q1,q12,#20
-	mov	r12,r12,ror#24
-	vshr.u32	q5,q13,#20
-	eor	r10,r10,r0,ror#24
-	vshr.u32	q9,q14,#20
-	eor	r12,r12,r1,ror#24
-	vsli.32	q1,q12,#12
-	add	r8,r8,r10
-	vsli.32	q5,q13,#12
-	mov	r5,r5,ror#25
-	vsli.32	q9,q14,#12
-	str	r10,[sp,#4*(16+15)]
-	vadd.i32	q0,q0,q1
-	ldr	r10,[sp,#4*(16+13)]
-	vadd.i32	q4,q4,q5
-	add	r9,r9,r12
-	vadd.i32	q8,q8,q9
-	mov	r6,r6,ror#25
-	veor	q12,q3,q0
-	eor	r5,r5,r8,ror#25
-	veor	q13,q7,q4
-	eor	r6,r6,r9,ror#25
-	veor	q14,q11,q8
-	str	r8,[sp,#4*(16+10)]
-	vshr.u32	q3,q12,#24
-	ldr	r8,[sp,#4*(16+8)]
-	vshr.u32	q7,q13,#24
-	add	r2,r2,r7
-	vshr.u32	q11,q14,#24
-	mov	r10,r10,ror#16
-	vsli.32	q3,q12,#8
-	str	r9,[sp,#4*(16+11)]
-	vsli.32	q7,q13,#8
-	ldr	r9,[sp,#4*(16+9)]
-	vsli.32	q11,q14,#8
-	add	r3,r3,r4
-	vadd.i32	q2,q2,q3
-	mov	r14,r14,ror#16
-	vadd.i32	q6,q6,q7
-	eor	r10,r10,r2,ror#16
-	vadd.i32	q10,q10,q11
-	eor	r14,r14,r3,ror#16
-	veor	q12,q1,q2
-	add	r8,r8,r10
-	veor	q13,q5,q6
-	mov	r7,r7,ror#20
-	veor	q14,q9,q10
-	add	r9,r9,r14
-	vshr.u32	q1,q12,#25
-	mov	r4,r4,ror#20
-	vshr.u32	q5,q13,#25
-	eor	r7,r7,r8,ror#20
-	vshr.u32	q9,q14,#25
-	eor	r4,r4,r9,ror#20
-	vsli.32	q1,q12,#7
-	add	r2,r2,r7
-	vsli.32	q5,q13,#7
-	mov	r10,r10,ror#24
-	vsli.32	q9,q14,#7
-	add	r3,r3,r4
-	vext.8	q2,q2,q2,#8
-	mov	r14,r14,ror#24
-	vext.8	q6,q6,q6,#8
-	eor	r10,r10,r2,ror#24
-	vext.8	q10,q10,q10,#8
-	eor	r14,r14,r3,ror#24
-	vext.8	q1,q1,q1,#12
-	add	r8,r8,r10
-	vext.8	q5,q5,q5,#12
-	mov	r7,r7,ror#25
-	vext.8	q9,q9,q9,#12
-	add	r9,r9,r14
-	vext.8	q3,q3,q3,#4
-	mov	r4,r4,ror#25
-	vext.8	q7,q7,q7,#4
-	eor	r7,r7,r8,ror#25
-	vext.8	q11,q11,q11,#4
-	eor	r4,r4,r9,ror#25
-	bne	Loop_neon
-
-	add	r11,sp,#32
-	vld1.32	{q12,q13},[sp]		@ load key material
-	vld1.32	{q14,q15},[r11]
-
-	ldr	r11,[sp,#4*(32+2)]	@ load len
-
-	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
-	str	r9, [sp,#4*(16+9)]
-	str	r12,[sp,#4*(16+12)]
-	str	r10, [sp,#4*(16+13)]
-	str	r14,[sp,#4*(16+14)]
-
-	@ at this point we have first half of 512-bit result in
-	@ rx and second half at sp+4*(16+8)
-
-	ldr	r12,[sp,#4*(32+1)]	@ load inp
-	ldr	r14,[sp,#4*(32+0)]	@ load out
-
-	vadd.i32	q0,q0,q12		@ accumulate key material
-	vadd.i32	q4,q4,q12
-	vadd.i32	q8,q8,q12
-	vldr	d24,[sp,#4*(16+0)]	@ one
-
-	vadd.i32	q1,q1,q13
-	vadd.i32	q5,q5,q13
-	vadd.i32	q9,q9,q13
-	vldr	d26,[sp,#4*(16+2)]	@ two
-
-	vadd.i32	q2,q2,q14
-	vadd.i32	q6,q6,q14
-	vadd.i32	q10,q10,q14
-	vadd.i32	d14,d14,d24	@ counter+1
-	vadd.i32	d22,d22,d26	@ counter+2
-
-	vadd.i32	q3,q3,q15
-	vadd.i32	q7,q7,q15
-	vadd.i32	q11,q11,q15
-
-	cmp	r11,#64*4
-	blo	Ltail_neon
-
-	vld1.8	{q12,q13},[r12]!	@ load input
-	mov	r11,sp
-	vld1.8	{q14,q15},[r12]!
-	veor	q0,q0,q12		@ xor with input
-	veor	q1,q1,q13
-	vld1.8	{q12,q13},[r12]!
-	veor	q2,q2,q14
-	veor	q3,q3,q15
-	vld1.8	{q14,q15},[r12]!
-
-	veor	q4,q4,q12
-	vst1.8	{q0,q1},[r14]!	@ store output
-	veor	q5,q5,q13
-	vld1.8	{q12,q13},[r12]!
-	veor	q6,q6,q14
-	vst1.8	{q2,q3},[r14]!
-	veor	q7,q7,q15
-	vld1.8	{q14,q15},[r12]!
-
-	veor	q8,q8,q12
-	vld1.32	{q0,q1},[r11]!	@ load for next iteration
-	veor	d25,d25,d25
-	vldr	d24,[sp,#4*(16+4)]	@ four
-	veor	q9,q9,q13
-	vld1.32	{q2,q3},[r11]
-	veor	q10,q10,q14
-	vst1.8	{q4,q5},[r14]!
-	veor	q11,q11,q15
-	vst1.8	{q6,q7},[r14]!
-
-	vadd.i32	d6,d6,d24	@ next counter value
-	vldr	d24,[sp,#4*(16+0)]	@ one
-
-	ldmia	sp,{r8,r9,r10,r11}	@ load key material
-	add	r0,r0,r8	@ accumulate key material
-	ldr	r8,[r12],#16		@ load input
-	vst1.8	{q8,q9},[r14]!
-	add	r1,r1,r9
-	ldr	r9,[r12,#-12]
-	vst1.8	{q10,q11},[r14]!
-	add	r2,r2,r10
-	ldr	r10,[r12,#-8]
-	add	r3,r3,r11
-	ldr	r11,[r12,#-4]
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-	eor	r0,r0,r8	@ xor with input
-	add	r8,sp,#4*(4)
-	eor	r1,r1,r9
-	str	r0,[r14],#16		@ store output
-	eor	r2,r2,r10
-	str	r1,[r14,#-12]
-	eor	r3,r3,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r2,[r14,#-8]
-	str	r3,[r14,#-4]
-
-	add	r4,r4,r8	@ accumulate key material
-	ldr	r8,[r12],#16		@ load input
-	add	r5,r5,r9
-	ldr	r9,[r12,#-12]
-	add	r6,r6,r10
-	ldr	r10,[r12,#-8]
-	add	r7,r7,r11
-	ldr	r11,[r12,#-4]
-# ifdef	__ARMEB__
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-	eor	r4,r4,r8
-	add	r8,sp,#4*(8)
-	eor	r5,r5,r9
-	str	r4,[r14],#16		@ store output
-	eor	r6,r6,r10
-	str	r5,[r14,#-12]
-	eor	r7,r7,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r6,[r14,#-8]
-	add	r0,sp,#4*(16+8)
-	str	r7,[r14,#-4]
-
-	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
-
-	add	r0,r0,r8	@ accumulate key material
-	ldr	r8,[r12],#16		@ load input
-	add	r1,r1,r9
-	ldr	r9,[r12,#-12]
-# ifdef	__thumb2__
-	it	hi
-# endif
-	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
-	add	r2,r2,r10
-	ldr	r10,[r12,#-8]
-# ifdef	__thumb2__
-	it	hi
-# endif
-	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
-	add	r3,r3,r11
-	ldr	r11,[r12,#-4]
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-	eor	r0,r0,r8
-	add	r8,sp,#4*(12)
-	eor	r1,r1,r9
-	str	r0,[r14],#16		@ store output
-	eor	r2,r2,r10
-	str	r1,[r14,#-12]
-	eor	r3,r3,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-	str	r2,[r14,#-8]
-	str	r3,[r14,#-4]
-
-	add	r4,r4,r8	@ accumulate key material
-	add	r8,r8,#4		@ next counter value
-	add	r5,r5,r9
-	str	r8,[sp,#4*(12)]	@ save next counter value
-	ldr	r8,[r12],#16		@ load input
-	add	r6,r6,r10
-	add	r4,r4,#3		@ counter+3
-	ldr	r9,[r12,#-12]
-	add	r7,r7,r11
-	ldr	r10,[r12,#-8]
-	ldr	r11,[r12,#-4]
-# ifdef	__ARMEB__
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-	eor	r4,r4,r8
-# ifdef	__thumb2__
-	it	hi
-# endif
-	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
-	eor	r5,r5,r9
-	eor	r6,r6,r10
-	str	r4,[r14],#16		@ store output
-	eor	r7,r7,r11
-	str	r5,[r14,#-12]
-	sub	r11,r8,#64*4	@ len-=64*4
-	str	r6,[r14,#-8]
-	str	r7,[r14,#-4]
-	bhi	Loop_neon_outer
-
-	b	Ldone_neon
-
-.align	4
-Lbreak_neon:
-	@ harmonize NEON and integer-only stack frames: load data
-	@ from NEON frame, but save to integer-only one; distance
-	@ between the two is 4*(32+4+16-32)=4*(20).
-
-	str	r11, [sp,#4*(20+32+2)]	@ save len
-	add	r11,sp,#4*(32+4)
-	str	r12,   [sp,#4*(20+32+1)]	@ save inp
-	str	r14,   [sp,#4*(20+32+0)]	@ save out
-
-	ldr	r12,[sp,#4*(16+10)]
-	ldr	r14,[sp,#4*(16+11)]
-	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
-	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
-	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
-
-	ldr	r11, [sp,#4*(15)]
-	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
-	ldr	r10, [sp,#4*(13)]
-	ldr	r14,[sp,#4*(14)]
-	str	r11, [sp,#4*(20+16+15)]
-	add	r11,sp,#4*(20)
-	vst1.32	{q0,q1},[r11]!		@ copy key
-	add	sp,sp,#4*(20)			@ switch frame
-	vst1.32	{q2,q3},[r11]
-	mov	r11,#10
-	b	Loop				@ go integer-only
-
-.align	4
-Ltail_neon:
-	cmp	r11,#64*3
-	bhs	L192_or_more_neon
-	cmp	r11,#64*2
-	bhs	L128_or_more_neon
-	cmp	r11,#64*1
-	bhs	L64_or_more_neon
-
-	add	r8,sp,#4*(8)
-	vst1.8	{q0,q1},[sp]
-	add	r10,sp,#4*(0)
-	vst1.8	{q2,q3},[r8]
-	b	Loop_tail_neon
-
-.align	4
-L64_or_more_neon:
-	vld1.8	{q12,q13},[r12]!
-	vld1.8	{q14,q15},[r12]!
-	veor	q0,q0,q12
-	veor	q1,q1,q13
-	veor	q2,q2,q14
-	veor	q3,q3,q15
-	vst1.8	{q0,q1},[r14]!
-	vst1.8	{q2,q3},[r14]!
-
-	beq	Ldone_neon
-
-	add	r8,sp,#4*(8)
-	vst1.8	{q4,q5},[sp]
-	add	r10,sp,#4*(0)
-	vst1.8	{q6,q7},[r8]
-	sub	r11,r11,#64*1	@ len-=64*1
-	b	Loop_tail_neon
-
-.align	4
-L128_or_more_neon:
-	vld1.8	{q12,q13},[r12]!
-	vld1.8	{q14,q15},[r12]!
-	veor	q0,q0,q12
-	veor	q1,q1,q13
-	vld1.8	{q12,q13},[r12]!
-	veor	q2,q2,q14
-	veor	q3,q3,q15
-	vld1.8	{q14,q15},[r12]!
-
-	veor	q4,q4,q12
-	veor	q5,q5,q13
-	vst1.8	{q0,q1},[r14]!
-	veor	q6,q6,q14
-	vst1.8	{q2,q3},[r14]!
-	veor	q7,q7,q15
-	vst1.8	{q4,q5},[r14]!
-	vst1.8	{q6,q7},[r14]!
-
-	beq	Ldone_neon
-
-	add	r8,sp,#4*(8)
-	vst1.8	{q8,q9},[sp]
-	add	r10,sp,#4*(0)
-	vst1.8	{q10,q11},[r8]
-	sub	r11,r11,#64*2	@ len-=64*2
-	b	Loop_tail_neon
-
-.align	4
-L192_or_more_neon:
-	vld1.8	{q12,q13},[r12]!
-	vld1.8	{q14,q15},[r12]!
-	veor	q0,q0,q12
-	veor	q1,q1,q13
-	vld1.8	{q12,q13},[r12]!
-	veor	q2,q2,q14
-	veor	q3,q3,q15
-	vld1.8	{q14,q15},[r12]!
-
-	veor	q4,q4,q12
-	veor	q5,q5,q13
-	vld1.8	{q12,q13},[r12]!
-	veor	q6,q6,q14
-	vst1.8	{q0,q1},[r14]!
-	veor	q7,q7,q15
-	vld1.8	{q14,q15},[r12]!
-
-	veor	q8,q8,q12
-	vst1.8	{q2,q3},[r14]!
-	veor	q9,q9,q13
-	vst1.8	{q4,q5},[r14]!
-	veor	q10,q10,q14
-	vst1.8	{q6,q7},[r14]!
-	veor	q11,q11,q15
-	vst1.8	{q8,q9},[r14]!
-	vst1.8	{q10,q11},[r14]!
-
-	beq	Ldone_neon
-
-	ldmia	sp,{r8,r9,r10,r11}	@ load key material
-	add	r0,r0,r8	@ accumulate key material
-	add	r8,sp,#4*(4)
-	add	r1,r1,r9
-	add	r2,r2,r10
-	add	r3,r3,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-
-	add	r4,r4,r8	@ accumulate key material
-	add	r8,sp,#4*(8)
-	add	r5,r5,r9
-	add	r6,r6,r10
-	add	r7,r7,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
-	add	r0,sp,#4*(16+8)
-
-	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
-
-	add	r0,r0,r8	@ accumulate key material
-	add	r8,sp,#4*(12)
-	add	r1,r1,r9
-	add	r2,r2,r10
-	add	r3,r3,r11
-	ldmia	r8,{r8,r9,r10,r11}	@ load key material
-
-	add	r4,r4,r8	@ accumulate key material
-	add	r8,sp,#4*(8)
-	add	r5,r5,r9
-	add	r4,r4,#3		@ counter+3
-	add	r6,r6,r10
-	add	r7,r7,r11
-	ldr	r11,[sp,#4*(32+2)]	@ re-load len
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-	rev	r4,r4
-	rev	r5,r5
-	rev	r6,r6
-	rev	r7,r7
-# endif
-	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
-	add	r10,sp,#4*(0)
-	sub	r11,r11,#64*3	@ len-=64*3
-
-Loop_tail_neon:
-	ldrb	r8,[r10],#1	@ read buffer on stack
-	ldrb	r9,[r12],#1		@ read input
-	subs	r11,r11,#1
-	eor	r8,r8,r9
-	strb	r8,[r14],#1		@ store output
-	bne	Loop_tail_neon
-
-Ldone_neon:
-	add	sp,sp,#4*(32+4)
-	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
-	add	sp,sp,#4*(16+3)
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-
-.comm	_OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol	_OPENSSL_armcap_P
-.long	0
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S b/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S
deleted file mode 100644
index 638062a..0000000
--- a/apple-arm/crypto/fipsmodule/aesv8-armv7-apple.S
+++ /dev/null
@@ -1,801 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-
-.code	32
-#undef	__thumb2__
-.align	5
-Lrcon:
-.long	0x01,0x01,0x01,0x01
-.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
-.long	0x1b,0x1b,0x1b,0x1b
-
-.text
-
-.globl	_aes_hw_set_encrypt_key
-.private_extern	_aes_hw_set_encrypt_key
-#ifdef __thumb2__
-.thumb_func	_aes_hw_set_encrypt_key
-#endif
-.align	5
-_aes_hw_set_encrypt_key:
-Lenc_key:
-	mov	r3,#-1
-	cmp	r0,#0
-	beq	Lenc_key_abort
-	cmp	r2,#0
-	beq	Lenc_key_abort
-	mov	r3,#-2
-	cmp	r1,#128
-	blt	Lenc_key_abort
-	cmp	r1,#256
-	bgt	Lenc_key_abort
-	tst	r1,#0x3f
-	bne	Lenc_key_abort
-
-	adr	r3,Lrcon
-	cmp	r1,#192
-
-	veor	q0,q0,q0
-	vld1.8	{q3},[r0]!
-	mov	r1,#8		@ reuse r1
-	vld1.32	{q1,q2},[r3]!
-
-	blt	Loop128
-	beq	L192
-	b	L256
-
-.align	4
-Loop128:
-	vtbl.8	d20,{q3},d4
-	vtbl.8	d21,{q3},d5
-	vext.8	q9,q0,q3,#12
-	vst1.32	{q3},[r2]!
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-	subs	r1,r1,#1
-
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q10,q10,q1
-	veor	q3,q3,q9
-	vshl.u8	q1,q1,#1
-	veor	q3,q3,q10
-	bne	Loop128
-
-	vld1.32	{q1},[r3]
-
-	vtbl.8	d20,{q3},d4
-	vtbl.8	d21,{q3},d5
-	vext.8	q9,q0,q3,#12
-	vst1.32	{q3},[r2]!
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q10,q10,q1
-	veor	q3,q3,q9
-	vshl.u8	q1,q1,#1
-	veor	q3,q3,q10
-
-	vtbl.8	d20,{q3},d4
-	vtbl.8	d21,{q3},d5
-	vext.8	q9,q0,q3,#12
-	vst1.32	{q3},[r2]!
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q10,q10,q1
-	veor	q3,q3,q9
-	veor	q3,q3,q10
-	vst1.32	{q3},[r2]
-	add	r2,r2,#0x50
-
-	mov	r12,#10
-	b	Ldone
-
-.align	4
-L192:
-	vld1.8	{d16},[r0]!
-	vmov.i8	q10,#8			@ borrow q10
-	vst1.32	{q3},[r2]!
-	vsub.i8	q2,q2,q10	@ adjust the mask
-
-Loop192:
-	vtbl.8	d20,{q8},d4
-	vtbl.8	d21,{q8},d5
-	vext.8	q9,q0,q3,#12
-	vst1.32	{d16},[r2]!
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-	subs	r1,r1,#1
-
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-
-	vdup.32	q9,d7[1]
-	veor	q9,q9,q8
-	veor	q10,q10,q1
-	vext.8	q8,q0,q8,#12
-	vshl.u8	q1,q1,#1
-	veor	q8,q8,q9
-	veor	q3,q3,q10
-	veor	q8,q8,q10
-	vst1.32	{q3},[r2]!
-	bne	Loop192
-
-	mov	r12,#12
-	add	r2,r2,#0x20
-	b	Ldone
-
-.align	4
-L256:
-	vld1.8	{q8},[r0]
-	mov	r1,#7
-	mov	r12,#14
-	vst1.32	{q3},[r2]!
-
-Loop256:
-	vtbl.8	d20,{q8},d4
-	vtbl.8	d21,{q8},d5
-	vext.8	q9,q0,q3,#12
-	vst1.32	{q8},[r2]!
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-	subs	r1,r1,#1
-
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q3,q3,q9
-	vext.8	q9,q0,q9,#12
-	veor	q10,q10,q1
-	veor	q3,q3,q9
-	vshl.u8	q1,q1,#1
-	veor	q3,q3,q10
-	vst1.32	{q3},[r2]!
-	beq	Ldone
-
-	vdup.32	q10,d7[1]
-	vext.8	q9,q0,q8,#12
-.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
-
-	veor	q8,q8,q9
-	vext.8	q9,q0,q9,#12
-	veor	q8,q8,q9
-	vext.8	q9,q0,q9,#12
-	veor	q8,q8,q9
-
-	veor	q8,q8,q10
-	b	Loop256
-
-Ldone:
-	str	r12,[r2]
-	mov	r3,#0
-
-Lenc_key_abort:
-	mov	r0,r3			@ return value
-
-	bx	lr
-
-
-.globl	_aes_hw_set_decrypt_key
-.private_extern	_aes_hw_set_decrypt_key
-#ifdef __thumb2__
-.thumb_func	_aes_hw_set_decrypt_key
-#endif
-.align	5
-_aes_hw_set_decrypt_key:
-	stmdb	sp!,{r4,lr}
-	bl	Lenc_key
-
-	cmp	r0,#0
-	bne	Ldec_key_abort
-
-	sub	r2,r2,#240		@ restore original r2
-	mov	r4,#-16
-	add	r0,r2,r12,lsl#4	@ end of key schedule
-
-	vld1.32	{q0},[r2]
-	vld1.32	{q1},[r0]
-	vst1.32	{q0},[r0],r4
-	vst1.32	{q1},[r2]!
-
-Loop_imc:
-	vld1.32	{q0},[r2]
-	vld1.32	{q1},[r0]
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-	vst1.32	{q0},[r0],r4
-	vst1.32	{q1},[r2]!
-	cmp	r0,r2
-	bhi	Loop_imc
-
-	vld1.32	{q0},[r2]
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-	vst1.32	{q0},[r0]
-
-	eor	r0,r0,r0		@ return value
-Ldec_key_abort:
-	ldmia	sp!,{r4,pc}
-
-.globl	_aes_hw_encrypt
-.private_extern	_aes_hw_encrypt
-#ifdef __thumb2__
-.thumb_func	_aes_hw_encrypt
-#endif
-.align	5
-_aes_hw_encrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	r3,[r2,#240]
-	vld1.32	{q0},[r2]!
-	vld1.8	{q2},[r0]
-	sub	r3,r3,#2
-	vld1.32	{q1},[r2]!
-
-Loop_enc:
-.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
-.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
-	vld1.32	{q0},[r2]!
-	subs	r3,r3,#2
-.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
-.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
-	vld1.32	{q1},[r2]!
-	bgt	Loop_enc
-
-.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
-.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
-	vld1.32	{q0},[r2]
-.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
-	veor	q2,q2,q0
-
-	vst1.8	{q2},[r1]
-	bx	lr
-
-.globl	_aes_hw_decrypt
-.private_extern	_aes_hw_decrypt
-#ifdef __thumb2__
-.thumb_func	_aes_hw_decrypt
-#endif
-.align	5
-_aes_hw_decrypt:
-	AARCH64_VALID_CALL_TARGET
-	ldr	r3,[r2,#240]
-	vld1.32	{q0},[r2]!
-	vld1.8	{q2},[r0]
-	sub	r3,r3,#2
-	vld1.32	{q1},[r2]!
-
-Loop_dec:
-.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
-.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
-	vld1.32	{q0},[r2]!
-	subs	r3,r3,#2
-.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
-.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
-	vld1.32	{q1},[r2]!
-	bgt	Loop_dec
-
-.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
-.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
-	vld1.32	{q0},[r2]
-.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
-	veor	q2,q2,q0
-
-	vst1.8	{q2},[r1]
-	bx	lr
-
-.globl	_aes_hw_cbc_encrypt
-.private_extern	_aes_hw_cbc_encrypt
-#ifdef __thumb2__
-.thumb_func	_aes_hw_cbc_encrypt
-#endif
-.align	5
-_aes_hw_cbc_encrypt:
-	mov	ip,sp
-	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
-	ldmia	ip,{r4,r5}		@ load remaining args
-	subs	r2,r2,#16
-	mov	r8,#16
-	blo	Lcbc_abort
-	moveq	r8,#0
-
-	cmp	r5,#0			@ en- or decrypting?
-	ldr	r5,[r3,#240]
-	and	r2,r2,#-16
-	vld1.8	{q6},[r4]
-	vld1.8	{q0},[r0],r8
-
-	vld1.32	{q8,q9},[r3]		@ load key schedule...
-	sub	r5,r5,#6
-	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
-	sub	r5,r5,#2
-	vld1.32	{q10,q11},[r7]!
-	vld1.32	{q12,q13},[r7]!
-	vld1.32	{q14,q15},[r7]!
-	vld1.32	{q7},[r7]
-
-	add	r7,r3,#32
-	mov	r6,r5
-	beq	Lcbc_dec
-
-	cmp	r5,#2
-	veor	q0,q0,q6
-	veor	q5,q8,q7
-	beq	Lcbc_enc128
-
-	vld1.32	{q2,q3},[r7]
-	add	r7,r3,#16
-	add	r6,r3,#16*4
-	add	r12,r3,#16*5
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	add	r14,r3,#16*6
-	add	r3,r3,#16*7
-	b	Lenter_cbc_enc
-
-.align	4
-Loop_cbc_enc:
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vst1.8	{q6},[r1]!
-Lenter_cbc_enc:
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.32	{q8},[r6]
-	cmp	r5,#4
-.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.32	{q9},[r12]
-	beq	Lcbc_enc192
-
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.32	{q8},[r14]
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.32	{q9},[r3]
-	nop
-
-Lcbc_enc192:
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	subs	r2,r2,#16
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	moveq	r8,#0
-.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.8	{q8},[r0],r8
-.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	veor	q8,q8,q5
-.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
-.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
-	veor	q6,q0,q7
-	bhs	Loop_cbc_enc
-
-	vst1.8	{q6},[r1]!
-	b	Lcbc_done
-
-.align	5
-Lcbc_enc128:
-	vld1.32	{q2,q3},[r7]
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	b	Lenter_cbc_enc128
-Loop_cbc_enc128:
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vst1.8	{q6},[r1]!
-Lenter_cbc_enc128:
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	subs	r2,r2,#16
-.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	moveq	r8,#0
-.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	vld1.8	{q8},[r0],r8
-.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-	veor	q8,q8,q5
-.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
-	veor	q6,q0,q7
-	bhs	Loop_cbc_enc128
-
-	vst1.8	{q6},[r1]!
-	b	Lcbc_done
-.align	5
-Lcbc_dec:
-	vld1.8	{q10},[r0]!
-	subs	r2,r2,#32		@ bias
-	add	r6,r5,#2
-	vorr	q3,q0,q0
-	vorr	q1,q0,q0
-	vorr	q11,q10,q10
-	blo	Lcbc_dec_tail
-
-	vorr	q1,q10,q10
-	vld1.8	{q10},[r0]!
-	vorr	q2,q0,q0
-	vorr	q3,q1,q1
-	vorr	q11,q10,q10
-
-Loop3x_cbc_dec:
-.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.32	{q8},[r7]!
-	subs	r6,r6,#2
-.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.32	{q9},[r7]!
-	bgt	Loop3x_cbc_dec
-
-.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	veor	q4,q6,q7
-	subs	r2,r2,#0x30
-	veor	q5,q2,q7
-	movlo	r6,r2			@ r6 is zero at this point
-.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	veor	q9,q3,q7
-	add	r0,r0,r6		@ r0 is adjusted in such a way that
-					@ at exit from the loop q1-q10
-					@ are loaded with the last "words"
-	vorr	q6,q11,q11
-	mov	r7,r3
-.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.8	{q2},[r0]!
-.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.8	{q3},[r0]!
-.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
-.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
-.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.8	{q11},[r0]!
-.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
-.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
-.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
-	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
-	add	r6,r5,#2
-	veor	q4,q4,q0
-	veor	q5,q5,q1
-	veor	q10,q10,q9
-	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
-	vst1.8	{q4},[r1]!
-	vorr	q0,q2,q2
-	vst1.8	{q5},[r1]!
-	vorr	q1,q3,q3
-	vst1.8	{q10},[r1]!
-	vorr	q10,q11,q11
-	bhs	Loop3x_cbc_dec
-
-	cmn	r2,#0x30
-	beq	Lcbc_done
-	nop
-
-Lcbc_dec_tail:
-.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.32	{q8},[r7]!
-	subs	r6,r6,#2
-.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	vld1.32	{q9},[r7]!
-	bgt	Lcbc_dec_tail
-
-.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	cmn	r2,#0x20
-.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	veor	q5,q6,q7
-.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
-.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
-.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
-.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
-	veor	q9,q3,q7
-.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
-.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
-	beq	Lcbc_dec_one
-	veor	q5,q5,q1
-	veor	q9,q9,q10
-	vorr	q6,q11,q11
-	vst1.8	{q5},[r1]!
-	vst1.8	{q9},[r1]!
-	b	Lcbc_done
-
-Lcbc_dec_one:
-	veor	q5,q5,q10
-	vorr	q6,q11,q11
-	vst1.8	{q5},[r1]!
-
-Lcbc_done:
-	vst1.8	{q6},[r4]
-Lcbc_abort:
-	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
-
-.globl	_aes_hw_ctr32_encrypt_blocks
-.private_extern	_aes_hw_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func	_aes_hw_ctr32_encrypt_blocks
-#endif
-.align	5
-_aes_hw_ctr32_encrypt_blocks:
-	mov	ip,sp
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
-	ldr	r4, [ip]		@ load remaining arg
-	ldr	r5,[r3,#240]
-
-	ldr	r8, [r4, #12]
-	vld1.32	{q0},[r4]
-
-	vld1.32	{q8,q9},[r3]		@ load key schedule...
-	sub	r5,r5,#4
-	mov	r12,#16
-	cmp	r2,#2
-	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
-	sub	r5,r5,#2
-	vld1.32	{q12,q13},[r7]!
-	vld1.32	{q14,q15},[r7]!
-	vld1.32	{q7},[r7]
-	add	r7,r3,#32
-	mov	r6,r5
-	movlo	r12,#0
-
-	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
-	@ affected by silicon errata #1742098 [0] and #1655431 [1],
-	@ respectively, where the second instruction of an aese/aesmc
-	@ instruction pair may execute twice if an interrupt is taken right
-	@ after the first instruction consumes an input register of which a
-	@ single 32-bit lane has been updated the last time it was modified.
-	@ 
-	@ This function uses a counter in one 32-bit lane. The vmov.32 writes
-	@ could update q1 and q10 directly, but that trips these bugs.
-	@ We write to q6 and copy to the final register as a workaround.
-	@ 
-	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
-	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
-#ifndef __ARMEB__
-	rev	r8, r8
-#endif
-	add	r10, r8, #1
-	vorr	q6,q0,q0
-	rev	r10, r10
-	vmov.32	d13[1],r10
-	add	r8, r8, #2
-	vorr	q1,q6,q6
-	bls	Lctr32_tail
-	rev	r12, r8
-	vmov.32	d13[1],r12
-	sub	r2,r2,#3		@ bias
-	vorr	q10,q6,q6
-	b	Loop3x_ctr32
-
-.align	4
-Loop3x_ctr32:
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
-.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
-	vld1.32	{q8},[r7]!
-	subs	r6,r6,#2
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
-.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
-	vld1.32	{q9},[r7]!
-	bgt	Loop3x_ctr32
-
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
-.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
-.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
-	vld1.8	{q2},[r0]!
-	add	r9,r8,#1
-.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
-.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
-	vld1.8	{q3},[r0]!
-	rev	r9,r9
-.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
-.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
-.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
-.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
-	vld1.8	{q11},[r0]!
-	mov	r7,r3
-.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
-.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
-.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
-.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
-.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
-.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
-	veor	q2,q2,q7
-	add	r10,r8,#2
-.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
-.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
-	veor	q3,q3,q7
-	add	r8,r8,#3
-.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
-.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
-.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
-.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
-	 @ Note the logic to update q0, q1, and q10 is written to work
-	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
-	 @ 32-bit mode. See the comment above.
-	veor	q11,q11,q7
-	vmov.32	d13[1], r9
-.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
-.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
-	vorr	q0,q6,q6
-	rev	r10,r10
-.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
-.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
-	vmov.32	d13[1], r10
-	rev	r12,r8
-.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
-.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
-	vorr	q1,q6,q6
-	vmov.32	d13[1], r12
-.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
-.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
-	vorr	q10,q6,q6
-	subs	r2,r2,#3
-.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
-.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
-.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
-
-	veor	q2,q2,q4
-	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
-	vst1.8	{q2},[r1]!
-	veor	q3,q3,q5
-	mov	r6,r5
-	vst1.8	{q3},[r1]!
-	veor	q11,q11,q9
-	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
-	vst1.8	{q11},[r1]!
-	bhs	Loop3x_ctr32
-
-	adds	r2,r2,#3
-	beq	Lctr32_done
-	cmp	r2,#1
-	mov	r12,#16
-	moveq	r12,#0
-
-Lctr32_tail:
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	vld1.32	{q8},[r7]!
-	subs	r6,r6,#2
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	vld1.32	{q9},[r7]!
-	bgt	Lctr32_tail
-
-.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	vld1.8	{q2},[r0],r12
-.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	vld1.8	{q3},[r0]
-.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	veor	q2,q2,q7
-.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
-.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
-.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
-.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
-	veor	q3,q3,q7
-.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
-.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
-
-	cmp	r2,#1
-	veor	q2,q2,q0
-	veor	q3,q3,q1
-	vst1.8	{q2},[r1]!
-	beq	Lctr32_done
-	vst1.8	{q3},[r1]
-
-Lctr32_done:
-	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/armv4-mont-apple.S b/apple-arm/crypto/fipsmodule/armv4-mont-apple.S
deleted file mode 100644
index 54bd13f..0000000
--- a/apple-arm/crypto/fipsmodule/armv4-mont-apple.S
+++ /dev/null
@@ -1,974 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.align	5
-LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-Lbn_mul_mont
-#endif
-
-.globl	_bn_mul_mont
-.private_extern	_bn_mul_mont
-#ifdef __thumb2__
-.thumb_func	_bn_mul_mont
-#endif
-
-.align	5
-_bn_mul_mont:
-Lbn_mul_mont:
-	ldr	ip,[sp,#4]		@ load num
-	stmdb	sp!,{r0,r2}		@ sp points at argument block
-#if __ARM_MAX_ARCH__>=7
-	tst	ip,#7
-	bne	Lialu
-	adr	r0,Lbn_mul_mont
-	ldr	r2,LOPENSSL_armcap
-	ldr	r0,[r0,r2]
-#ifdef	__APPLE__
-	ldr	r0,[r0]
-#endif
-	tst	r0,#ARMV7_NEON		@ NEON available?
-	ldmia	sp, {r0,r2}
-	beq	Lialu
-	add	sp,sp,#8
-	b	bn_mul8x_mont_neon
-.align	4
-Lialu:
-#endif
-	cmp	ip,#2
-	mov	r0,ip			@ load num
-#ifdef	__thumb2__
-	ittt	lt
-#endif
-	movlt	r0,#0
-	addlt	sp,sp,#2*4
-	blt	Labrt
-
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
-
-	mov	r0,r0,lsl#2		@ rescale r0 for byte count
-	sub	sp,sp,r0		@ alloca(4*num)
-	sub	sp,sp,#4		@ +extra dword
-	sub	r0,r0,#4		@ "num=num-1"
-	add	r4,r2,r0		@ &bp[num-1]
-
-	add	r0,sp,r0		@ r0 to point at &tp[num-1]
-	ldr	r8,[r0,#14*4]		@ &n0
-	ldr	r2,[r2]		@ bp[0]
-	ldr	r5,[r1],#4		@ ap[0],ap++
-	ldr	r6,[r3],#4		@ np[0],np++
-	ldr	r8,[r8]		@ *n0
-	str	r4,[r0,#15*4]		@ save &bp[num]
-
-	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
-	str	r8,[r0,#14*4]		@ save n0 value
-	mul	r8,r10,r8		@ "tp[0]"*n0
-	mov	r12,#0
-	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
-	mov	r4,sp
-
-L1st:
-	ldr	r5,[r1],#4		@ ap[j],ap++
-	mov	r10,r11
-	ldr	r6,[r3],#4		@ np[j],np++
-	mov	r11,#0
-	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
-	mov	r14,#0
-	umlal	r12,r14,r6,r8	@ np[j]*n0
-	adds	r12,r12,r10
-	str	r12,[r4],#4		@ tp[j-1]=,tp++
-	adc	r12,r14,#0
-	cmp	r4,r0
-	bne	L1st
-
-	adds	r12,r12,r11
-	ldr	r4,[r0,#13*4]		@ restore bp
-	mov	r14,#0
-	ldr	r8,[r0,#14*4]		@ restore n0
-	adc	r14,r14,#0
-	str	r12,[r0]		@ tp[num-1]=
-	mov	r7,sp
-	str	r14,[r0,#4]		@ tp[num]=
-
-Louter:
-	sub	r7,r0,r7		@ "original" r0-1 value
-	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
-	ldr	r2,[r4,#4]!		@ *(++bp)
-	sub	r3,r3,r7		@ "rewind" np to &np[1]
-	ldr	r5,[r1,#-4]		@ ap[0]
-	ldr	r10,[sp]		@ tp[0]
-	ldr	r6,[r3,#-4]		@ np[0]
-	ldr	r7,[sp,#4]		@ tp[1]
-
-	mov	r11,#0
-	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
-	str	r4,[r0,#13*4]		@ save bp
-	mul	r8,r10,r8
-	mov	r12,#0
-	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
-	mov	r4,sp
-
-Linner:
-	ldr	r5,[r1],#4		@ ap[j],ap++
-	adds	r10,r11,r7		@ +=tp[j]
-	ldr	r6,[r3],#4		@ np[j],np++
-	mov	r11,#0
-	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
-	mov	r14,#0
-	umlal	r12,r14,r6,r8	@ np[j]*n0
-	adc	r11,r11,#0
-	ldr	r7,[r4,#8]		@ tp[j+1]
-	adds	r12,r12,r10
-	str	r12,[r4],#4		@ tp[j-1]=,tp++
-	adc	r12,r14,#0
-	cmp	r4,r0
-	bne	Linner
-
-	adds	r12,r12,r11
-	mov	r14,#0
-	ldr	r4,[r0,#13*4]		@ restore bp
-	adc	r14,r14,#0
-	ldr	r8,[r0,#14*4]		@ restore n0
-	adds	r12,r12,r7
-	ldr	r7,[r0,#15*4]		@ restore &bp[num]
-	adc	r14,r14,#0
-	str	r12,[r0]		@ tp[num-1]=
-	str	r14,[r0,#4]		@ tp[num]=
-
-	cmp	r4,r7
-#ifdef	__thumb2__
-	itt	ne
-#endif
-	movne	r7,sp
-	bne	Louter
-
-	ldr	r2,[r0,#12*4]		@ pull rp
-	mov	r5,sp
-	add	r0,r0,#4		@ r0 to point at &tp[num]
-	sub	r5,r0,r5		@ "original" num value
-	mov	r4,sp			@ "rewind" r4
-	mov	r1,r4			@ "borrow" r1
-	sub	r3,r3,r5		@ "rewind" r3 to &np[0]
-
-	subs	r7,r7,r7		@ "clear" carry flag
-Lsub:	ldr	r7,[r4],#4
-	ldr	r6,[r3],#4
-	sbcs	r7,r7,r6		@ tp[j]-np[j]
-	str	r7,[r2],#4		@ rp[j]=
-	teq	r4,r0		@ preserve carry
-	bne	Lsub
-	sbcs	r14,r14,#0		@ upmost carry
-	mov	r4,sp			@ "rewind" r4
-	sub	r2,r2,r5		@ "rewind" r2
-
-Lcopy:	ldr	r7,[r4]		@ conditional copy
-	ldr	r5,[r2]
-	str	sp,[r4],#4		@ zap tp
-#ifdef	__thumb2__
-	it	cc
-#endif
-	movcc	r5,r7
-	str	r5,[r2],#4
-	teq	r4,r0		@ preserve carry
-	bne	Lcopy
-
-	mov	sp,r0
-	add	sp,sp,#4		@ skip over tp[num+1]
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
-	add	sp,sp,#2*4		@ skip over {r0,r2}
-	mov	r0,#1
-Labrt:
-#if __ARM_ARCH__>=5
-	bx	lr				@ bx lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func	bn_mul8x_mont_neon
-#endif
-.align	5
-bn_mul8x_mont_neon:
-	mov	ip,sp
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
-	ldmia	ip,{r4,r5}		@ load rest of parameter block
-	mov	ip,sp
-
-	cmp	r5,#8
-	bhi	LNEON_8n
-
-	@ special case for r5==8, everything is in register bank...
-
-	vld1.32	{d28[0]}, [r2,:32]!
-	veor	d8,d8,d8
-	sub	r7,sp,r5,lsl#4
-	vld1.32	{d0,d1,d2,d3},  [r1]!		@ can't specify :32 :-(
-	and	r7,r7,#-64
-	vld1.32	{d30[0]}, [r4,:32]
-	mov	sp,r7			@ alloca
-	vzip.16	d28,d8
-
-	vmull.u32	q6,d28,d0[0]
-	vmull.u32	q7,d28,d0[1]
-	vmull.u32	q8,d28,d1[0]
-	vshl.i64	d29,d13,#16
-	vmull.u32	q9,d28,d1[1]
-
-	vadd.u64	d29,d29,d12
-	veor	d8,d8,d8
-	vmul.u32	d29,d29,d30
-
-	vmull.u32	q10,d28,d2[0]
-	vld1.32	{d4,d5,d6,d7}, [r3]!
-	vmull.u32	q11,d28,d2[1]
-	vmull.u32	q12,d28,d3[0]
-	vzip.16	d29,d8
-	vmull.u32	q13,d28,d3[1]
-
-	vmlal.u32	q6,d29,d4[0]
-	sub	r9,r5,#1
-	vmlal.u32	q7,d29,d4[1]
-	vmlal.u32	q8,d29,d5[0]
-	vmlal.u32	q9,d29,d5[1]
-
-	vmlal.u32	q10,d29,d6[0]
-	vmov	q5,q6
-	vmlal.u32	q11,d29,d6[1]
-	vmov	q6,q7
-	vmlal.u32	q12,d29,d7[0]
-	vmov	q7,q8
-	vmlal.u32	q13,d29,d7[1]
-	vmov	q8,q9
-	vmov	q9,q10
-	vshr.u64	d10,d10,#16
-	vmov	q10,q11
-	vmov	q11,q12
-	vadd.u64	d10,d10,d11
-	vmov	q12,q13
-	veor	q13,q13
-	vshr.u64	d10,d10,#16
-
-	b	LNEON_outer8
-
-.align	4
-LNEON_outer8:
-	vld1.32	{d28[0]}, [r2,:32]!
-	veor	d8,d8,d8
-	vzip.16	d28,d8
-	vadd.u64	d12,d12,d10
-
-	vmlal.u32	q6,d28,d0[0]
-	vmlal.u32	q7,d28,d0[1]
-	vmlal.u32	q8,d28,d1[0]
-	vshl.i64	d29,d13,#16
-	vmlal.u32	q9,d28,d1[1]
-
-	vadd.u64	d29,d29,d12
-	veor	d8,d8,d8
-	subs	r9,r9,#1
-	vmul.u32	d29,d29,d30
-
-	vmlal.u32	q10,d28,d2[0]
-	vmlal.u32	q11,d28,d2[1]
-	vmlal.u32	q12,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q13,d28,d3[1]
-
-	vmlal.u32	q6,d29,d4[0]
-	vmlal.u32	q7,d29,d4[1]
-	vmlal.u32	q8,d29,d5[0]
-	vmlal.u32	q9,d29,d5[1]
-
-	vmlal.u32	q10,d29,d6[0]
-	vmov	q5,q6
-	vmlal.u32	q11,d29,d6[1]
-	vmov	q6,q7
-	vmlal.u32	q12,d29,d7[0]
-	vmov	q7,q8
-	vmlal.u32	q13,d29,d7[1]
-	vmov	q8,q9
-	vmov	q9,q10
-	vshr.u64	d10,d10,#16
-	vmov	q10,q11
-	vmov	q11,q12
-	vadd.u64	d10,d10,d11
-	vmov	q12,q13
-	veor	q13,q13
-	vshr.u64	d10,d10,#16
-
-	bne	LNEON_outer8
-
-	vadd.u64	d12,d12,d10
-	mov	r7,sp
-	vshr.u64	d10,d12,#16
-	mov	r8,r5
-	vadd.u64	d13,d13,d10
-	add	r6,sp,#96
-	vshr.u64	d10,d13,#16
-	vzip.16	d12,d13
-
-	b	LNEON_tail_entry
-
-.align	4
-LNEON_8n:
-	veor	q6,q6,q6
-	sub	r7,sp,#128
-	veor	q7,q7,q7
-	sub	r7,r7,r5,lsl#4
-	veor	q8,q8,q8
-	and	r7,r7,#-64
-	veor	q9,q9,q9
-	mov	sp,r7			@ alloca
-	veor	q10,q10,q10
-	add	r7,r7,#256
-	veor	q11,q11,q11
-	sub	r8,r5,#8
-	veor	q12,q12,q12
-	veor	q13,q13,q13
-
-LNEON_8n_init:
-	vst1.64	{q6,q7},[r7,:256]!
-	subs	r8,r8,#8
-	vst1.64	{q8,q9},[r7,:256]!
-	vst1.64	{q10,q11},[r7,:256]!
-	vst1.64	{q12,q13},[r7,:256]!
-	bne	LNEON_8n_init
-
-	add	r6,sp,#256
-	vld1.32	{d0,d1,d2,d3},[r1]!
-	add	r10,sp,#8
-	vld1.32	{d30[0]},[r4,:32]
-	mov	r9,r5
-	b	LNEON_8n_outer
-
-.align	4
-LNEON_8n_outer:
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	veor	d8,d8,d8
-	vzip.16	d28,d8
-	add	r7,sp,#128
-	vld1.32	{d4,d5,d6,d7},[r3]!
-
-	vmlal.u32	q6,d28,d0[0]
-	vmlal.u32	q7,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q8,d28,d1[0]
-	vshl.i64	d29,d13,#16
-	vmlal.u32	q9,d28,d1[1]
-	vadd.u64	d29,d29,d12
-	vmlal.u32	q10,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q11,d28,d2[1]
-	vst1.32	{d28},[sp,:64]		@ put aside smashed b[8*i+0]
-	vmlal.u32	q12,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q13,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q6,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q7,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q8,d29,d5[0]
-	vshr.u64	d12,d12,#16
-	vmlal.u32	q9,d29,d5[1]
-	vmlal.u32	q10,d29,d6[0]
-	vadd.u64	d12,d12,d13
-	vmlal.u32	q11,d29,d6[1]
-	vshr.u64	d12,d12,#16
-	vmlal.u32	q12,d29,d7[0]
-	vmlal.u32	q13,d29,d7[1]
-	vadd.u64	d14,d14,d12
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+0]
-	vmlal.u32	q7,d28,d0[0]
-	vld1.64	{q6},[r6,:128]!
-	vmlal.u32	q8,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q9,d28,d1[0]
-	vshl.i64	d29,d15,#16
-	vmlal.u32	q10,d28,d1[1]
-	vadd.u64	d29,d29,d14
-	vmlal.u32	q11,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q12,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+1]
-	vmlal.u32	q13,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q6,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q7,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q8,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q9,d29,d5[0]
-	vshr.u64	d14,d14,#16
-	vmlal.u32	q10,d29,d5[1]
-	vmlal.u32	q11,d29,d6[0]
-	vadd.u64	d14,d14,d15
-	vmlal.u32	q12,d29,d6[1]
-	vshr.u64	d14,d14,#16
-	vmlal.u32	q13,d29,d7[0]
-	vmlal.u32	q6,d29,d7[1]
-	vadd.u64	d16,d16,d14
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+1]
-	vmlal.u32	q8,d28,d0[0]
-	vld1.64	{q7},[r6,:128]!
-	vmlal.u32	q9,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q10,d28,d1[0]
-	vshl.i64	d29,d17,#16
-	vmlal.u32	q11,d28,d1[1]
-	vadd.u64	d29,d29,d16
-	vmlal.u32	q12,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q13,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+2]
-	vmlal.u32	q6,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q7,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q8,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q9,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q10,d29,d5[0]
-	vshr.u64	d16,d16,#16
-	vmlal.u32	q11,d29,d5[1]
-	vmlal.u32	q12,d29,d6[0]
-	vadd.u64	d16,d16,d17
-	vmlal.u32	q13,d29,d6[1]
-	vshr.u64	d16,d16,#16
-	vmlal.u32	q6,d29,d7[0]
-	vmlal.u32	q7,d29,d7[1]
-	vadd.u64	d18,d18,d16
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+2]
-	vmlal.u32	q9,d28,d0[0]
-	vld1.64	{q8},[r6,:128]!
-	vmlal.u32	q10,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q11,d28,d1[0]
-	vshl.i64	d29,d19,#16
-	vmlal.u32	q12,d28,d1[1]
-	vadd.u64	d29,d29,d18
-	vmlal.u32	q13,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q6,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+3]
-	vmlal.u32	q7,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q8,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q9,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q10,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q11,d29,d5[0]
-	vshr.u64	d18,d18,#16
-	vmlal.u32	q12,d29,d5[1]
-	vmlal.u32	q13,d29,d6[0]
-	vadd.u64	d18,d18,d19
-	vmlal.u32	q6,d29,d6[1]
-	vshr.u64	d18,d18,#16
-	vmlal.u32	q7,d29,d7[0]
-	vmlal.u32	q8,d29,d7[1]
-	vadd.u64	d20,d20,d18
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+3]
-	vmlal.u32	q10,d28,d0[0]
-	vld1.64	{q9},[r6,:128]!
-	vmlal.u32	q11,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q12,d28,d1[0]
-	vshl.i64	d29,d21,#16
-	vmlal.u32	q13,d28,d1[1]
-	vadd.u64	d29,d29,d20
-	vmlal.u32	q6,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q7,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+4]
-	vmlal.u32	q8,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q9,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q10,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q11,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q12,d29,d5[0]
-	vshr.u64	d20,d20,#16
-	vmlal.u32	q13,d29,d5[1]
-	vmlal.u32	q6,d29,d6[0]
-	vadd.u64	d20,d20,d21
-	vmlal.u32	q7,d29,d6[1]
-	vshr.u64	d20,d20,#16
-	vmlal.u32	q8,d29,d7[0]
-	vmlal.u32	q9,d29,d7[1]
-	vadd.u64	d22,d22,d20
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+4]
-	vmlal.u32	q11,d28,d0[0]
-	vld1.64	{q10},[r6,:128]!
-	vmlal.u32	q12,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q13,d28,d1[0]
-	vshl.i64	d29,d23,#16
-	vmlal.u32	q6,d28,d1[1]
-	vadd.u64	d29,d29,d22
-	vmlal.u32	q7,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q8,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+5]
-	vmlal.u32	q9,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q10,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q11,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q12,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q13,d29,d5[0]
-	vshr.u64	d22,d22,#16
-	vmlal.u32	q6,d29,d5[1]
-	vmlal.u32	q7,d29,d6[0]
-	vadd.u64	d22,d22,d23
-	vmlal.u32	q8,d29,d6[1]
-	vshr.u64	d22,d22,#16
-	vmlal.u32	q9,d29,d7[0]
-	vmlal.u32	q10,d29,d7[1]
-	vadd.u64	d24,d24,d22
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+5]
-	vmlal.u32	q12,d28,d0[0]
-	vld1.64	{q11},[r6,:128]!
-	vmlal.u32	q13,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q6,d28,d1[0]
-	vshl.i64	d29,d25,#16
-	vmlal.u32	q7,d28,d1[1]
-	vadd.u64	d29,d29,d24
-	vmlal.u32	q8,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q9,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+6]
-	vmlal.u32	q10,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q11,d28,d3[1]
-	vld1.32	{d28[0]},[r2,:32]!	@ *b++
-	vmlal.u32	q12,d29,d4[0]
-	veor	d10,d10,d10
-	vmlal.u32	q13,d29,d4[1]
-	vzip.16	d28,d10
-	vmlal.u32	q6,d29,d5[0]
-	vshr.u64	d24,d24,#16
-	vmlal.u32	q7,d29,d5[1]
-	vmlal.u32	q8,d29,d6[0]
-	vadd.u64	d24,d24,d25
-	vmlal.u32	q9,d29,d6[1]
-	vshr.u64	d24,d24,#16
-	vmlal.u32	q10,d29,d7[0]
-	vmlal.u32	q11,d29,d7[1]
-	vadd.u64	d26,d26,d24
-	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+6]
-	vmlal.u32	q13,d28,d0[0]
-	vld1.64	{q12},[r6,:128]!
-	vmlal.u32	q6,d28,d0[1]
-	veor	d8,d8,d8
-	vmlal.u32	q7,d28,d1[0]
-	vshl.i64	d29,d27,#16
-	vmlal.u32	q8,d28,d1[1]
-	vadd.u64	d29,d29,d26
-	vmlal.u32	q9,d28,d2[0]
-	vmul.u32	d29,d29,d30
-	vmlal.u32	q10,d28,d2[1]
-	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+7]
-	vmlal.u32	q11,d28,d3[0]
-	vzip.16	d29,d8
-	vmlal.u32	q12,d28,d3[1]
-	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
-	vmlal.u32	q13,d29,d4[0]
-	vld1.32	{d0,d1,d2,d3},[r1]!
-	vmlal.u32	q6,d29,d4[1]
-	vmlal.u32	q7,d29,d5[0]
-	vshr.u64	d26,d26,#16
-	vmlal.u32	q8,d29,d5[1]
-	vmlal.u32	q9,d29,d6[0]
-	vadd.u64	d26,d26,d27
-	vmlal.u32	q10,d29,d6[1]
-	vshr.u64	d26,d26,#16
-	vmlal.u32	q11,d29,d7[0]
-	vmlal.u32	q12,d29,d7[1]
-	vadd.u64	d12,d12,d26
-	vst1.32	{d29},[r10,:64]	@ put aside smashed m[8*i+7]
-	add	r10,sp,#8		@ rewind
-	sub	r8,r5,#8
-	b	LNEON_8n_inner
-
-.align	4
-LNEON_8n_inner:
-	subs	r8,r8,#8
-	vmlal.u32	q6,d28,d0[0]
-	vld1.64	{q13},[r6,:128]
-	vmlal.u32	q7,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+0]
-	vmlal.u32	q8,d28,d1[0]
-	vld1.32	{d4,d5,d6,d7},[r3]!
-	vmlal.u32	q9,d28,d1[1]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q10,d28,d2[0]
-	vmlal.u32	q11,d28,d2[1]
-	vmlal.u32	q12,d28,d3[0]
-	vmlal.u32	q13,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+1]
-	vmlal.u32	q6,d29,d4[0]
-	vmlal.u32	q7,d29,d4[1]
-	vmlal.u32	q8,d29,d5[0]
-	vmlal.u32	q9,d29,d5[1]
-	vmlal.u32	q10,d29,d6[0]
-	vmlal.u32	q11,d29,d6[1]
-	vmlal.u32	q12,d29,d7[0]
-	vmlal.u32	q13,d29,d7[1]
-	vst1.64	{q6},[r7,:128]!
-	vmlal.u32	q7,d28,d0[0]
-	vld1.64	{q6},[r6,:128]
-	vmlal.u32	q8,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+1]
-	vmlal.u32	q9,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q10,d28,d1[1]
-	vmlal.u32	q11,d28,d2[0]
-	vmlal.u32	q12,d28,d2[1]
-	vmlal.u32	q13,d28,d3[0]
-	vmlal.u32	q6,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+2]
-	vmlal.u32	q7,d29,d4[0]
-	vmlal.u32	q8,d29,d4[1]
-	vmlal.u32	q9,d29,d5[0]
-	vmlal.u32	q10,d29,d5[1]
-	vmlal.u32	q11,d29,d6[0]
-	vmlal.u32	q12,d29,d6[1]
-	vmlal.u32	q13,d29,d7[0]
-	vmlal.u32	q6,d29,d7[1]
-	vst1.64	{q7},[r7,:128]!
-	vmlal.u32	q8,d28,d0[0]
-	vld1.64	{q7},[r6,:128]
-	vmlal.u32	q9,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+2]
-	vmlal.u32	q10,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q11,d28,d1[1]
-	vmlal.u32	q12,d28,d2[0]
-	vmlal.u32	q13,d28,d2[1]
-	vmlal.u32	q6,d28,d3[0]
-	vmlal.u32	q7,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+3]
-	vmlal.u32	q8,d29,d4[0]
-	vmlal.u32	q9,d29,d4[1]
-	vmlal.u32	q10,d29,d5[0]
-	vmlal.u32	q11,d29,d5[1]
-	vmlal.u32	q12,d29,d6[0]
-	vmlal.u32	q13,d29,d6[1]
-	vmlal.u32	q6,d29,d7[0]
-	vmlal.u32	q7,d29,d7[1]
-	vst1.64	{q8},[r7,:128]!
-	vmlal.u32	q9,d28,d0[0]
-	vld1.64	{q8},[r6,:128]
-	vmlal.u32	q10,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+3]
-	vmlal.u32	q11,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q12,d28,d1[1]
-	vmlal.u32	q13,d28,d2[0]
-	vmlal.u32	q6,d28,d2[1]
-	vmlal.u32	q7,d28,d3[0]
-	vmlal.u32	q8,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+4]
-	vmlal.u32	q9,d29,d4[0]
-	vmlal.u32	q10,d29,d4[1]
-	vmlal.u32	q11,d29,d5[0]
-	vmlal.u32	q12,d29,d5[1]
-	vmlal.u32	q13,d29,d6[0]
-	vmlal.u32	q6,d29,d6[1]
-	vmlal.u32	q7,d29,d7[0]
-	vmlal.u32	q8,d29,d7[1]
-	vst1.64	{q9},[r7,:128]!
-	vmlal.u32	q10,d28,d0[0]
-	vld1.64	{q9},[r6,:128]
-	vmlal.u32	q11,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+4]
-	vmlal.u32	q12,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q13,d28,d1[1]
-	vmlal.u32	q6,d28,d2[0]
-	vmlal.u32	q7,d28,d2[1]
-	vmlal.u32	q8,d28,d3[0]
-	vmlal.u32	q9,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+5]
-	vmlal.u32	q10,d29,d4[0]
-	vmlal.u32	q11,d29,d4[1]
-	vmlal.u32	q12,d29,d5[0]
-	vmlal.u32	q13,d29,d5[1]
-	vmlal.u32	q6,d29,d6[0]
-	vmlal.u32	q7,d29,d6[1]
-	vmlal.u32	q8,d29,d7[0]
-	vmlal.u32	q9,d29,d7[1]
-	vst1.64	{q10},[r7,:128]!
-	vmlal.u32	q11,d28,d0[0]
-	vld1.64	{q10},[r6,:128]
-	vmlal.u32	q12,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+5]
-	vmlal.u32	q13,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q6,d28,d1[1]
-	vmlal.u32	q7,d28,d2[0]
-	vmlal.u32	q8,d28,d2[1]
-	vmlal.u32	q9,d28,d3[0]
-	vmlal.u32	q10,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+6]
-	vmlal.u32	q11,d29,d4[0]
-	vmlal.u32	q12,d29,d4[1]
-	vmlal.u32	q13,d29,d5[0]
-	vmlal.u32	q6,d29,d5[1]
-	vmlal.u32	q7,d29,d6[0]
-	vmlal.u32	q8,d29,d6[1]
-	vmlal.u32	q9,d29,d7[0]
-	vmlal.u32	q10,d29,d7[1]
-	vst1.64	{q11},[r7,:128]!
-	vmlal.u32	q12,d28,d0[0]
-	vld1.64	{q11},[r6,:128]
-	vmlal.u32	q13,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+6]
-	vmlal.u32	q6,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q7,d28,d1[1]
-	vmlal.u32	q8,d28,d2[0]
-	vmlal.u32	q9,d28,d2[1]
-	vmlal.u32	q10,d28,d3[0]
-	vmlal.u32	q11,d28,d3[1]
-	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+7]
-	vmlal.u32	q12,d29,d4[0]
-	vmlal.u32	q13,d29,d4[1]
-	vmlal.u32	q6,d29,d5[0]
-	vmlal.u32	q7,d29,d5[1]
-	vmlal.u32	q8,d29,d6[0]
-	vmlal.u32	q9,d29,d6[1]
-	vmlal.u32	q10,d29,d7[0]
-	vmlal.u32	q11,d29,d7[1]
-	vst1.64	{q12},[r7,:128]!
-	vmlal.u32	q13,d28,d0[0]
-	vld1.64	{q12},[r6,:128]
-	vmlal.u32	q6,d28,d0[1]
-	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+7]
-	vmlal.u32	q7,d28,d1[0]
-	it	ne
-	addne	r6,r6,#16	@ don't advance in last iteration
-	vmlal.u32	q8,d28,d1[1]
-	vmlal.u32	q9,d28,d2[0]
-	vmlal.u32	q10,d28,d2[1]
-	vmlal.u32	q11,d28,d3[0]
-	vmlal.u32	q12,d28,d3[1]
-	it	eq
-	subeq	r1,r1,r5,lsl#2	@ rewind
-	vmlal.u32	q13,d29,d4[0]
-	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
-	vmlal.u32	q6,d29,d4[1]
-	vld1.32	{d0,d1,d2,d3},[r1]!
-	vmlal.u32	q7,d29,d5[0]
-	add	r10,sp,#8		@ rewind
-	vmlal.u32	q8,d29,d5[1]
-	vmlal.u32	q9,d29,d6[0]
-	vmlal.u32	q10,d29,d6[1]
-	vmlal.u32	q11,d29,d7[0]
-	vst1.64	{q13},[r7,:128]!
-	vmlal.u32	q12,d29,d7[1]
-
-	bne	LNEON_8n_inner
-	add	r6,sp,#128
-	vst1.64	{q6,q7},[r7,:256]!
-	veor	q2,q2,q2		@ d4-d5
-	vst1.64	{q8,q9},[r7,:256]!
-	veor	q3,q3,q3		@ d6-d7
-	vst1.64	{q10,q11},[r7,:256]!
-	vst1.64	{q12},[r7,:128]
-
-	subs	r9,r9,#8
-	vld1.64	{q6,q7},[r6,:256]!
-	vld1.64	{q8,q9},[r6,:256]!
-	vld1.64	{q10,q11},[r6,:256]!
-	vld1.64	{q12,q13},[r6,:256]!
-
-	itt	ne
-	subne	r3,r3,r5,lsl#2	@ rewind
-	bne	LNEON_8n_outer
-
-	add	r7,sp,#128
-	vst1.64	{q2,q3}, [sp,:256]!	@ start wiping stack frame
-	vshr.u64	d10,d12,#16
-	vst1.64	{q2,q3},[sp,:256]!
-	vadd.u64	d13,d13,d10
-	vst1.64	{q2,q3}, [sp,:256]!
-	vshr.u64	d10,d13,#16
-	vst1.64	{q2,q3}, [sp,:256]!
-	vzip.16	d12,d13
-
-	mov	r8,r5
-	b	LNEON_tail_entry
-
-.align	4
-LNEON_tail:
-	vadd.u64	d12,d12,d10
-	vshr.u64	d10,d12,#16
-	vld1.64	{q8,q9}, [r6, :256]!
-	vadd.u64	d13,d13,d10
-	vld1.64	{q10,q11}, [r6, :256]!
-	vshr.u64	d10,d13,#16
-	vld1.64	{q12,q13}, [r6, :256]!
-	vzip.16	d12,d13
-
-LNEON_tail_entry:
-	vadd.u64	d14,d14,d10
-	vst1.32	{d12[0]}, [r7, :32]!
-	vshr.u64	d10,d14,#16
-	vadd.u64	d15,d15,d10
-	vshr.u64	d10,d15,#16
-	vzip.16	d14,d15
-	vadd.u64	d16,d16,d10
-	vst1.32	{d14[0]}, [r7, :32]!
-	vshr.u64	d10,d16,#16
-	vadd.u64	d17,d17,d10
-	vshr.u64	d10,d17,#16
-	vzip.16	d16,d17
-	vadd.u64	d18,d18,d10
-	vst1.32	{d16[0]}, [r7, :32]!
-	vshr.u64	d10,d18,#16
-	vadd.u64	d19,d19,d10
-	vshr.u64	d10,d19,#16
-	vzip.16	d18,d19
-	vadd.u64	d20,d20,d10
-	vst1.32	{d18[0]}, [r7, :32]!
-	vshr.u64	d10,d20,#16
-	vadd.u64	d21,d21,d10
-	vshr.u64	d10,d21,#16
-	vzip.16	d20,d21
-	vadd.u64	d22,d22,d10
-	vst1.32	{d20[0]}, [r7, :32]!
-	vshr.u64	d10,d22,#16
-	vadd.u64	d23,d23,d10
-	vshr.u64	d10,d23,#16
-	vzip.16	d22,d23
-	vadd.u64	d24,d24,d10
-	vst1.32	{d22[0]}, [r7, :32]!
-	vshr.u64	d10,d24,#16
-	vadd.u64	d25,d25,d10
-	vshr.u64	d10,d25,#16
-	vzip.16	d24,d25
-	vadd.u64	d26,d26,d10
-	vst1.32	{d24[0]}, [r7, :32]!
-	vshr.u64	d10,d26,#16
-	vadd.u64	d27,d27,d10
-	vshr.u64	d10,d27,#16
-	vzip.16	d26,d27
-	vld1.64	{q6,q7}, [r6, :256]!
-	subs	r8,r8,#8
-	vst1.32	{d26[0]},   [r7, :32]!
-	bne	LNEON_tail
-
-	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
-	sub	r3,r3,r5,lsl#2			@ rewind r3
-	subs	r1,sp,#0				@ clear carry flag
-	add	r2,sp,r5,lsl#2
-
-LNEON_sub:
-	ldmia	r1!, {r4,r5,r6,r7}
-	ldmia	r3!, {r8,r9,r10,r11}
-	sbcs	r8, r4,r8
-	sbcs	r9, r5,r9
-	sbcs	r10,r6,r10
-	sbcs	r11,r7,r11
-	teq	r1,r2				@ preserves carry
-	stmia	r0!, {r8,r9,r10,r11}
-	bne	LNEON_sub
-
-	ldr	r10, [r1]				@ load top-most bit
-	mov	r11,sp
-	veor	q0,q0,q0
-	sub	r11,r2,r11				@ this is num*4
-	veor	q1,q1,q1
-	mov	r1,sp
-	sub	r0,r0,r11				@ rewind r0
-	mov	r3,r2				@ second 3/4th of frame
-	sbcs	r10,r10,#0				@ result is carry flag
-
-LNEON_copy_n_zap:
-	ldmia	r1!, {r4,r5,r6,r7}
-	ldmia	r0,  {r8,r9,r10,r11}
-	it	cc
-	movcc	r8, r4
-	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
-	itt	cc
-	movcc	r9, r5
-	movcc	r10,r6
-	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
-	it	cc
-	movcc	r11,r7
-	ldmia	r1, {r4,r5,r6,r7}
-	stmia	r0!, {r8,r9,r10,r11}
-	sub	r1,r1,#16
-	ldmia	r0, {r8,r9,r10,r11}
-	it	cc
-	movcc	r8, r4
-	vst1.64	{q0,q1}, [r1,:256]!			@ wipe
-	itt	cc
-	movcc	r9, r5
-	movcc	r10,r6
-	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
-	it	cc
-	movcc	r11,r7
-	teq	r1,r2				@ preserves carry
-	stmia	r0!, {r8,r9,r10,r11}
-	bne	LNEON_copy_n_zap
-
-	mov	sp,ip
-	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
-	bx	lr						@ bx lr
-
-#endif
-.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#if __ARM_MAX_ARCH__>=7
-.comm	_OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol	_OPENSSL_armcap_P
-.long	0
-.private_extern	_OPENSSL_armcap_P
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S b/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S
deleted file mode 100644
index 28cc6b3..0000000
--- a/apple-arm/crypto/fipsmodule/bsaes-armv7-apple.S
+++ /dev/null
@@ -1,1528 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License").  You may not use
-@ this file except in compliance with the License.  You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-@ of Linaro. Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ Bit-sliced AES for ARM NEON
-@
-@ February 2012.
-@
-@ This implementation is direct adaptation of bsaes-x86_64 module for
-@ ARM NEON. Except that this module is endian-neutral [in sense that
-@ it can be compiled for either endianness] by courtesy of vld1.8's
-@ neutrality. Initial version doesn't implement interface to OpenSSL,
-@ only low-level primitives and unsupported entry points, just enough
-@ to collect performance results, which for Cortex-A8 core are:
-@
-@ encrypt	19.5 cycles per byte processed with 128-bit key
-@ decrypt	22.1 cycles per byte processed with 128-bit key
-@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
-@
-@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
-@ which is [much] worse than anticipated (for further details see
-@ http://www.openssl.org/~appro/Snapdragon-S4.html).
-@
-@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-@ manages in 20.0 cycles].
-@
-@ When comparing to x86_64 results keep in mind that NEON unit is
-@ [mostly] single-issue and thus can't [fully] benefit from
-@ instruction-level parallelism. And when comparing to aes-armv4
-@ results keep in mind key schedule conversion overhead (see
-@ bsaes-x86_64.pl for further details)...
-@
-@						<appro@openssl.org>
-
-@ April-August 2013
-@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-# define VFP_ABI_FRAME	0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME	0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.text
-.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
-#if defined(__thumb2__) && !defined(__APPLE__)
-.thumb
-#else
-.code	32
-# undef __thumb2__
-#endif
-
-#ifdef __thumb2__
-.thumb_func	_bsaes_decrypt8
-#endif
-.align	4
-_bsaes_decrypt8:
-	adr	r6,.
-	vldmia	r4!, {q9}		@ round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	r6,LM0ISR
-#else
-	add	r6,r6,#LM0ISR-_bsaes_decrypt8
-#endif
-
-	vldmia	r6!, {q8}		@ LM0ISR
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	vtbl.8	d0, {q10}, d16
-	vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	vtbl.8	d2, {q11}, d16
-	vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	vtbl.8	d4, {q12}, d16
-	vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	vtbl.8	d6, {q13}, d16
-	vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	vtbl.8	d8, {q14}, d16
-	vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	vtbl.8	d10, {q15}, d16
-	vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	vtbl.8	d12, {q10}, d16
-	vtbl.8	d13, {q10}, d17
-	vtbl.8	d14, {q11}, d16
-	vtbl.8	d15, {q11}, d17
-	vmov.i8	q8,#0x55			@ compose LBS0
-	vmov.i8	q9,#0x33			@ compose LBS1
-	vshr.u64	q10, q6, #1
-	vshr.u64	q11, q4, #1
-	veor	q10, q10, q7
-	veor	q11, q11, q5
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #1
-	veor	q5, q5, q11
-	vshl.u64	q11, q11, #1
-	veor	q6, q6, q10
-	veor	q4, q4, q11
-	vshr.u64	q10, q2, #1
-	vshr.u64	q11, q0, #1
-	veor	q10, q10, q3
-	veor	q11, q11, q1
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q3, q3, q10
-	vshl.u64	q10, q10, #1
-	veor	q1, q1, q11
-	vshl.u64	q11, q11, #1
-	veor	q2, q2, q10
-	veor	q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose LBS2
-	vshr.u64	q10, q5, #2
-	vshr.u64	q11, q4, #2
-	veor	q10, q10, q7
-	veor	q11, q11, q6
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #2
-	veor	q6, q6, q11
-	vshl.u64	q11, q11, #2
-	veor	q5, q5, q10
-	veor	q4, q4, q11
-	vshr.u64	q10, q1, #2
-	vshr.u64	q11, q0, #2
-	veor	q10, q10, q3
-	veor	q11, q11, q2
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q3, q3, q10
-	vshl.u64	q10, q10, #2
-	veor	q2, q2, q11
-	vshl.u64	q11, q11, #2
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vshr.u64	q10, q3, #4
-	vshr.u64	q11, q2, #4
-	veor	q10, q10, q7
-	veor	q11, q11, q6
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #4
-	veor	q6, q6, q11
-	vshl.u64	q11, q11, #4
-	veor	q3, q3, q10
-	veor	q2, q2, q11
-	vshr.u64	q10, q1, #4
-	vshr.u64	q11, q0, #4
-	veor	q10, q10, q5
-	veor	q11, q11, q4
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #4
-	veor	q4, q4, q11
-	vshl.u64	q11, q11, #4
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	sub	r5,r5,#1
-	b	Ldec_sbox
-.align	4
-Ldec_loop:
-	vldmia	r4!, {q8,q9,q10,q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-Ldec_sbox:
-	veor	q1, q1, q4
-	veor	q3, q3, q4
-
-	veor	q4, q4, q7
-	veor	q1, q1, q6
-	veor	q2, q2, q7
-	veor	q6, q6, q4
-
-	veor	q0, q0, q1
-	veor	q2, q2, q5
-	veor	q7, q7, q6
-	veor	q3, q3, q0
-	veor	q5, q5, q0
-	veor	q1, q1, q3
-	veor	q11, q3, q0
-	veor	q10, q7, q4
-	veor	q9, q1, q6
-	veor	q13, q4, q0
-	vmov	q8, q10
-	veor	q12, q5, q2
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q6, q2
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q3, q7
-	veor	q12, q1, q5
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q4, q6
-	veor	q9, q9, q14
-	vand	q13, q0, q2
-	vand	q14, q7, q1
-	vorr	q15, q3, q5
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q5, q2
-	veor	q8, q1, q6
-	veor	q10, q15, q14
-	vand	q10, q10, q5
-	veor	q5, q5, q1
-	vand	q11, q1, q15
-	vand	q5, q5, q14
-	veor	q1, q11, q10
-	veor	q5, q5, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	veor	q10, q13, q9
-	vand	q11, q11, q12
-	vand	q10, q10, q2
-	veor	q12, q12, q8
-	veor	q2, q2, q6
-	vand	q8, q8, q15
-	vand	q6, q6, q13
-	vand	q12, q12, q14
-	vand	q2, q2, q9
-	veor	q8, q8, q12
-	veor	q2, q2, q6
-	veor	q12, q12, q11
-	veor	q6, q6, q10
-	veor	q5, q5, q12
-	veor	q2, q2, q12
-	veor	q1, q1, q8
-	veor	q6, q6, q8
-
-	veor	q12, q3, q0
-	veor	q8, q7, q4
-	veor	q11, q15, q14
-	veor	q10, q13, q9
-	vand	q11, q11, q12
-	vand	q10, q10, q0
-	veor	q12, q12, q8
-	veor	q0, q0, q4
-	vand	q8, q8, q15
-	vand	q4, q4, q13
-	vand	q12, q12, q14
-	vand	q0, q0, q9
-	veor	q8, q8, q12
-	veor	q0, q0, q4
-	veor	q12, q12, q11
-	veor	q4, q4, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q10, q15, q14
-	vand	q10, q10, q3
-	veor	q3, q3, q7
-	vand	q11, q7, q15
-	vand	q3, q3, q14
-	veor	q7, q11, q10
-	veor	q3, q3, q11
-	veor	q3, q3, q12
-	veor	q0, q0, q12
-	veor	q7, q7, q8
-	veor	q4, q4, q8
-	veor	q1, q1, q7
-	veor	q6, q6, q5
-
-	veor	q4, q4, q1
-	veor	q2, q2, q7
-	veor	q5, q5, q7
-	veor	q4, q4, q2
-	veor	q7, q7, q0
-	veor	q4, q4, q5
-	veor	q3, q3, q6
-	veor	q6, q6, q1
-	veor	q3, q3, q4
-
-	veor	q4, q4, q0
-	veor	q7, q7, q3
-	subs	r5,r5,#1
-	bcc	Ldec_done
-	@ multiplication by 0x05-0x00-0x04-0x00
-	vext.8	q8, q0, q0, #8
-	vext.8	q14, q3, q3, #8
-	vext.8	q15, q5, q5, #8
-	veor	q8, q8, q0
-	vext.8	q9, q1, q1, #8
-	veor	q14, q14, q3
-	vext.8	q10, q6, q6, #8
-	veor	q15, q15, q5
-	vext.8	q11, q4, q4, #8
-	veor	q9, q9, q1
-	vext.8	q12, q2, q2, #8
-	veor	q10, q10, q6
-	vext.8	q13, q7, q7, #8
-	veor	q11, q11, q4
-	veor	q12, q12, q2
-	veor	q13, q13, q7
-
-	veor	q0, q0, q14
-	veor	q1, q1, q14
-	veor	q6, q6, q8
-	veor	q2, q2, q10
-	veor	q4, q4, q9
-	veor	q1, q1, q15
-	veor	q6, q6, q15
-	veor	q2, q2, q14
-	veor	q7, q7, q11
-	veor	q4, q4, q14
-	veor	q3, q3, q12
-	veor	q2, q2, q15
-	veor	q7, q7, q15
-	veor	q5, q5, q13
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q6, q6, #12
-	veor	q1, q1, q9
-	vext.8	q11, q4, q4, #12
-	veor	q6, q6, q10
-	vext.8	q12, q2, q2, #12
-	veor	q4, q4, q11
-	vext.8	q13, q7, q7, #12
-	veor	q2, q2, q12
-	vext.8	q14, q3, q3, #12
-	veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	veor	q3, q3, q14
-
-	veor	q9, q9, q0
-	veor	q5, q5, q15
-	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	vext.8	q1, q1, q1, #8
-	veor	q13, q13, q2
-	veor	q0, q0, q8
-	veor	q14, q14, q7
-	veor	q1, q1, q9
-	vext.8	q8, q2, q2, #8
-	veor	q12, q12, q4
-	vext.8	q9, q7, q7, #8
-	veor	q15, q15, q3
-	vext.8	q2, q4, q4, #8
-	veor	q11, q11, q6
-	vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	vext.8	q4, q3, q3, #8
-	veor	q11, q11, q5
-	vext.8	q3, q6, q6, #8
-	veor	q5, q9, q13
-	veor	q11, q11, q2
-	veor	q7, q7, q15
-	veor	q6, q4, q14
-	veor	q4, q8, q12
-	veor	q2, q3, q10
-	vmov	q3, q11
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ LISR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	r6,r6,#0x10
-	bne	Ldec_loop
-	vldmia	r6, {q12}		@ LISRM0
-	b	Ldec_loop
-.align	4
-Ldec_done:
-	vmov.i8	q8,#0x55			@ compose LBS0
-	vmov.i8	q9,#0x33			@ compose LBS1
-	vshr.u64	q10, q3, #1
-	vshr.u64	q11, q2, #1
-	veor	q10, q10, q5
-	veor	q11, q11, q7
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #1
-	veor	q7, q7, q11
-	vshl.u64	q11, q11, #1
-	veor	q3, q3, q10
-	veor	q2, q2, q11
-	vshr.u64	q10, q6, #1
-	vshr.u64	q11, q0, #1
-	veor	q10, q10, q4
-	veor	q11, q11, q1
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q4, q4, q10
-	vshl.u64	q10, q10, #1
-	veor	q1, q1, q11
-	vshl.u64	q11, q11, #1
-	veor	q6, q6, q10
-	veor	q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose LBS2
-	vshr.u64	q10, q7, #2
-	vshr.u64	q11, q2, #2
-	veor	q10, q10, q5
-	veor	q11, q11, q3
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #2
-	veor	q3, q3, q11
-	vshl.u64	q11, q11, #2
-	veor	q7, q7, q10
-	veor	q2, q2, q11
-	vshr.u64	q10, q1, #2
-	vshr.u64	q11, q0, #2
-	veor	q10, q10, q4
-	veor	q11, q11, q6
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q4, q4, q10
-	vshl.u64	q10, q10, #2
-	veor	q6, q6, q11
-	vshl.u64	q11, q11, #2
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vshr.u64	q10, q4, #4
-	vshr.u64	q11, q6, #4
-	veor	q10, q10, q5
-	veor	q11, q11, q3
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #4
-	veor	q3, q3, q11
-	vshl.u64	q11, q11, #4
-	veor	q4, q4, q10
-	veor	q6, q6, q11
-	vshr.u64	q10, q1, #4
-	vshr.u64	q11, q0, #4
-	veor	q10, q10, q7
-	veor	q11, q11, q2
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #4
-	veor	q2, q2, q11
-	vshl.u64	q11, q11, #4
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q6, q6, q8
-	veor	q4, q4, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q8
-	veor	q3, q3, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-
-
-
-.align	6
-_bsaes_const:
-LM0ISR:@ InvShiftRows constants
-.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-LISR:
-.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-LISRM0:
-.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-LM0SR:@ ShiftRows constants
-.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-LSR:
-.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-LSRM0:
-.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-LM0:
-.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-LREVM0SR:
-.quad	0x090d01050c000408, 0x03070b0f060a0e02
-.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	6
-
-
-#ifdef __thumb2__
-.thumb_func	_bsaes_encrypt8
-#endif
-.align	4
-_bsaes_encrypt8:
-	adr	r6,.
-	vldmia	r4!, {q9}		@ round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	r6,LM0SR
-#else
-	sub	r6,r6,#_bsaes_encrypt8-LM0SR
-#endif
-
-	vldmia	r6!, {q8}		@ LM0SR
-_bsaes_encrypt8_alt:
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	vtbl.8	d0, {q10}, d16
-	vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	vtbl.8	d2, {q11}, d16
-	vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	vtbl.8	d4, {q12}, d16
-	vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	vtbl.8	d6, {q13}, d16
-	vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	vtbl.8	d8, {q14}, d16
-	vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	vtbl.8	d10, {q15}, d16
-	vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	vtbl.8	d12, {q10}, d16
-	vtbl.8	d13, {q10}, d17
-	vtbl.8	d14, {q11}, d16
-	vtbl.8	d15, {q11}, d17
-_bsaes_encrypt8_bitslice:
-	vmov.i8	q8,#0x55			@ compose LBS0
-	vmov.i8	q9,#0x33			@ compose LBS1
-	vshr.u64	q10, q6, #1
-	vshr.u64	q11, q4, #1
-	veor	q10, q10, q7
-	veor	q11, q11, q5
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #1
-	veor	q5, q5, q11
-	vshl.u64	q11, q11, #1
-	veor	q6, q6, q10
-	veor	q4, q4, q11
-	vshr.u64	q10, q2, #1
-	vshr.u64	q11, q0, #1
-	veor	q10, q10, q3
-	veor	q11, q11, q1
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q3, q3, q10
-	vshl.u64	q10, q10, #1
-	veor	q1, q1, q11
-	vshl.u64	q11, q11, #1
-	veor	q2, q2, q10
-	veor	q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose LBS2
-	vshr.u64	q10, q5, #2
-	vshr.u64	q11, q4, #2
-	veor	q10, q10, q7
-	veor	q11, q11, q6
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #2
-	veor	q6, q6, q11
-	vshl.u64	q11, q11, #2
-	veor	q5, q5, q10
-	veor	q4, q4, q11
-	vshr.u64	q10, q1, #2
-	vshr.u64	q11, q0, #2
-	veor	q10, q10, q3
-	veor	q11, q11, q2
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q3, q3, q10
-	vshl.u64	q10, q10, #2
-	veor	q2, q2, q11
-	vshl.u64	q11, q11, #2
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vshr.u64	q10, q3, #4
-	vshr.u64	q11, q2, #4
-	veor	q10, q10, q7
-	veor	q11, q11, q6
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #4
-	veor	q6, q6, q11
-	vshl.u64	q11, q11, #4
-	veor	q3, q3, q10
-	veor	q2, q2, q11
-	vshr.u64	q10, q1, #4
-	vshr.u64	q11, q0, #4
-	veor	q10, q10, q5
-	veor	q11, q11, q4
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #4
-	veor	q4, q4, q11
-	vshl.u64	q11, q11, #4
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	sub	r5,r5,#1
-	b	Lenc_sbox
-.align	4
-Lenc_loop:
-	vldmia	r4!, {q8,q9,q10,q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-Lenc_sbox:
-	veor	q2, q2, q1
-	veor	q5, q5, q6
-	veor	q3, q3, q0
-	veor	q6, q6, q2
-	veor	q5, q5, q0
-
-	veor	q6, q6, q3
-	veor	q3, q3, q7
-	veor	q7, q7, q5
-	veor	q3, q3, q4
-	veor	q4, q4, q5
-
-	veor	q2, q2, q7
-	veor	q3, q3, q1
-	veor	q1, q1, q5
-	veor	q11, q7, q4
-	veor	q10, q1, q2
-	veor	q9, q5, q3
-	veor	q13, q2, q4
-	vmov	q8, q10
-	veor	q12, q6, q0
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q3, q0
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q7, q1
-	veor	q12, q5, q6
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q2, q3
-	veor	q9, q9, q14
-	vand	q13, q4, q0
-	vand	q14, q1, q5
-	vorr	q15, q7, q6
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q6, q0
-	veor	q8, q5, q3
-	veor	q10, q15, q14
-	vand	q10, q10, q6
-	veor	q6, q6, q5
-	vand	q11, q5, q15
-	vand	q6, q6, q14
-	veor	q5, q11, q10
-	veor	q6, q6, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	veor	q10, q13, q9
-	vand	q11, q11, q12
-	vand	q10, q10, q0
-	veor	q12, q12, q8
-	veor	q0, q0, q3
-	vand	q8, q8, q15
-	vand	q3, q3, q13
-	vand	q12, q12, q14
-	vand	q0, q0, q9
-	veor	q8, q8, q12
-	veor	q0, q0, q3
-	veor	q12, q12, q11
-	veor	q3, q3, q10
-	veor	q6, q6, q12
-	veor	q0, q0, q12
-	veor	q5, q5, q8
-	veor	q3, q3, q8
-
-	veor	q12, q7, q4
-	veor	q8, q1, q2
-	veor	q11, q15, q14
-	veor	q10, q13, q9
-	vand	q11, q11, q12
-	vand	q10, q10, q4
-	veor	q12, q12, q8
-	veor	q4, q4, q2
-	vand	q8, q8, q15
-	vand	q2, q2, q13
-	vand	q12, q12, q14
-	vand	q4, q4, q9
-	veor	q8, q8, q12
-	veor	q4, q4, q2
-	veor	q12, q12, q11
-	veor	q2, q2, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q10, q15, q14
-	vand	q10, q10, q7
-	veor	q7, q7, q1
-	vand	q11, q1, q15
-	vand	q7, q7, q14
-	veor	q1, q11, q10
-	veor	q7, q7, q11
-	veor	q7, q7, q12
-	veor	q4, q4, q12
-	veor	q1, q1, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q0
-	veor	q1, q1, q6
-	veor	q6, q6, q0
-	veor	q4, q4, q7
-	veor	q0, q0, q1
-
-	veor	q1, q1, q5
-	veor	q5, q5, q2
-	veor	q2, q2, q3
-	veor	q3, q3, q5
-	veor	q4, q4, q5
-
-	veor	q6, q6, q3
-	subs	r5,r5,#1
-	bcc	Lenc_done
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q4, q4, #12
-	veor	q1, q1, q9
-	vext.8	q11, q6, q6, #12
-	veor	q4, q4, q10
-	vext.8	q12, q3, q3, #12
-	veor	q6, q6, q11
-	vext.8	q13, q7, q7, #12
-	veor	q3, q3, q12
-	vext.8	q14, q2, q2, #12
-	veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	veor	q2, q2, q14
-
-	veor	q9, q9, q0
-	veor	q5, q5, q15
-	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	vext.8	q1, q1, q1, #8
-	veor	q13, q13, q3
-	veor	q0, q0, q8
-	veor	q14, q14, q7
-	veor	q1, q1, q9
-	vext.8	q8, q3, q3, #8
-	veor	q12, q12, q6
-	vext.8	q9, q7, q7, #8
-	veor	q15, q15, q2
-	vext.8	q3, q6, q6, #8
-	veor	q11, q11, q4
-	vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	vext.8	q6, q2, q2, #8
-	veor	q11, q11, q5
-	vext.8	q2, q4, q4, #8
-	veor	q5, q9, q13
-	veor	q4, q8, q12
-	veor	q3, q3, q11
-	veor	q7, q7, q15
-	veor	q6, q6, q14
-	 @ vmov	q4, q8
-	veor	q2, q2, q10
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ LSR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	r6,r6,#0x10
-	bne	Lenc_loop
-	vldmia	r6, {q12}		@ LSRM0
-	b	Lenc_loop
-.align	4
-Lenc_done:
-	vmov.i8	q8,#0x55			@ compose LBS0
-	vmov.i8	q9,#0x33			@ compose LBS1
-	vshr.u64	q10, q2, #1
-	vshr.u64	q11, q3, #1
-	veor	q10, q10, q5
-	veor	q11, q11, q7
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #1
-	veor	q7, q7, q11
-	vshl.u64	q11, q11, #1
-	veor	q2, q2, q10
-	veor	q3, q3, q11
-	vshr.u64	q10, q4, #1
-	vshr.u64	q11, q0, #1
-	veor	q10, q10, q6
-	veor	q11, q11, q1
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q6, q6, q10
-	vshl.u64	q10, q10, #1
-	veor	q1, q1, q11
-	vshl.u64	q11, q11, #1
-	veor	q4, q4, q10
-	veor	q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose LBS2
-	vshr.u64	q10, q7, #2
-	vshr.u64	q11, q3, #2
-	veor	q10, q10, q5
-	veor	q11, q11, q2
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #2
-	veor	q2, q2, q11
-	vshl.u64	q11, q11, #2
-	veor	q7, q7, q10
-	veor	q3, q3, q11
-	vshr.u64	q10, q1, #2
-	vshr.u64	q11, q0, #2
-	veor	q10, q10, q6
-	veor	q11, q11, q4
-	vand	q10, q10, q9
-	vand	q11, q11, q9
-	veor	q6, q6, q10
-	vshl.u64	q10, q10, #2
-	veor	q4, q4, q11
-	vshl.u64	q11, q11, #2
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vshr.u64	q10, q6, #4
-	vshr.u64	q11, q4, #4
-	veor	q10, q10, q5
-	veor	q11, q11, q2
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q5, q5, q10
-	vshl.u64	q10, q10, #4
-	veor	q2, q2, q11
-	vshl.u64	q11, q11, #4
-	veor	q6, q6, q10
-	veor	q4, q4, q11
-	vshr.u64	q10, q1, #4
-	vshr.u64	q11, q0, #4
-	veor	q10, q10, q7
-	veor	q11, q11, q3
-	vand	q10, q10, q8
-	vand	q11, q11, q8
-	veor	q7, q7, q10
-	vshl.u64	q10, q10, #4
-	veor	q3, q3, q11
-	vshl.u64	q11, q11, #4
-	veor	q1, q1, q10
-	veor	q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q4, q4, q8
-	veor	q6, q6, q8
-	veor	q3, q3, q8
-	veor	q7, q7, q8
-	veor	q2, q2, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-
-#ifdef __thumb2__
-.thumb_func	_bsaes_key_convert
-#endif
-.align	4
-_bsaes_key_convert:
-	adr	r6,.
-	vld1.8	{q7},  [r4]!		@ load round 0 key
-#if defined(__thumb2__) || defined(__APPLE__)
-	adr	r6,LM0
-#else
-	sub	r6,r6,#_bsaes_key_convert-LM0
-#endif
-	vld1.8	{q15}, [r4]!		@ load round 1 key
-
-	vmov.i8	q8,  #0x01			@ bit masks
-	vmov.i8	q9,  #0x02
-	vmov.i8	q10, #0x04
-	vmov.i8	q11, #0x08
-	vmov.i8	q12, #0x10
-	vmov.i8	q13, #0x20
-	vldmia	r6, {q14}		@ LM0
-
-#ifdef __ARMEL__
-	vrev32.8	q7,  q7
-	vrev32.8	q15, q15
-#endif
-	sub	r5,r5,#1
-	vstmia	r12!, {q7}		@ save round 0 key
-	b	Lkey_loop
-
-.align	4
-Lkey_loop:
-	vtbl.8	d14,{q15},d28
-	vtbl.8	d15,{q15},d29
-	vmov.i8	q6,  #0x40
-	vmov.i8	q15, #0x80
-
-	vtst.8	q0, q7, q8
-	vtst.8	q1, q7, q9
-	vtst.8	q2, q7, q10
-	vtst.8	q3, q7, q11
-	vtst.8	q4, q7, q12
-	vtst.8	q5, q7, q13
-	vtst.8	q6, q7, q6
-	vtst.8	q7, q7, q15
-	vld1.8	{q15}, [r4]!		@ load next round key
-	vmvn	q0, q0		@ "pnot"
-	vmvn	q1, q1
-	vmvn	q5, q5
-	vmvn	q6, q6
-#ifdef __ARMEL__
-	vrev32.8	q15, q15
-#endif
-	subs	r5,r5,#1
-	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
-	bne	Lkey_loop
-
-	vmov.i8	q7,#0x63			@ compose L63
-	@ don't save last round key
-	bx	lr
-
-.globl	_bsaes_cbc_encrypt
-.private_extern	_bsaes_cbc_encrypt
-#ifdef __thumb2__
-.thumb_func	_bsaes_cbc_encrypt
-#endif
-.align	5
-_bsaes_cbc_encrypt:
-	@ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
-	@ short inputs. We patch this out, using bsaes for all input sizes.
-
-	@ it is up to the caller to make sure we are called with enc == 0
-
-	mov	ip, sp
-	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ IV is 1st arg on the stack
-	mov	r2, r2, lsr#4		@ len in 16 byte blocks
-	sub	sp, #0x10			@ scratch space to carry over the IV
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	vldmia	sp, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	sp, {q7}
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, r3, #248
-	vldmia	r4, {q6}
-	vstmia	r12, {q15}			@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-
-.align	2
-
-#endif
-
-	vld1.8	{q15}, [r8]		@ load IV
-	b	Lcbc_dec_loop
-
-.align	4
-Lcbc_dec_loop:
-	subs	r2, r2, #0x8
-	bmi	Lcbc_dec_loop_finish
-
-	vld1.8	{q0,q1}, [r0]!	@ load input
-	vld1.8	{q2,q3}, [r0]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	vld1.8	{q4,q5}, [r0]!
-	mov	r5, r10
-	vld1.8	{q6,q7}, [r0]
-	sub	r0, r0, #0x60
-	vstmia	r9, {q15}			@ put aside IV
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10,q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12,q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q14,q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	veor	q5, q5, q14
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	vst1.8	{q5}, [r1]!
-
-	b	Lcbc_dec_loop
-
-Lcbc_dec_loop_finish:
-	adds	r2, r2, #8
-	beq	Lcbc_dec_done
-
-	@ Set up most parameters for the _bsaes_decrypt8 call.
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	mov	r5, r10
-	vstmia	r9, {q15}			@ put aside IV
-
-	vld1.8	{q0}, [r0]!		@ load input
-	cmp	r2, #2
-	blo	Lcbc_dec_one
-	vld1.8	{q1}, [r0]!
-	beq	Lcbc_dec_two
-	vld1.8	{q2}, [r0]!
-	cmp	r2, #4
-	blo	Lcbc_dec_three
-	vld1.8	{q3}, [r0]!
-	beq	Lcbc_dec_four
-	vld1.8	{q4}, [r0]!
-	cmp	r2, #6
-	blo	Lcbc_dec_five
-	vld1.8	{q5}, [r0]!
-	beq	Lcbc_dec_six
-	vld1.8	{q6}, [r0]!
-	sub	r0, r0, #0x70
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10,q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12,q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_six:
-	sub	r0, r0, #0x60
-	bl	_bsaes_decrypt8
-	vldmia	r9,{q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10,q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_five:
-	sub	r0, r0, #0x50
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10,q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	veor	q2, q2, q11
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_four:
-	sub	r0, r0, #0x40
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_three:
-	sub	r0, r0, #0x30
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8,q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_two:
-	sub	r0, r0, #0x20
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8}, [r0]!		@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!		@ reload input
-	veor	q1, q1, q8
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	b	Lcbc_dec_done
-.align	4
-Lcbc_dec_one:
-	sub	r0, r0, #0x10
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q15}, [r0]!		@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vst1.8	{q0}, [r1]!		@ write output
-
-Lcbc_dec_done:
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-Lcbc_dec_bzero:@ wipe key schedule [if any]
-	vstmia	sp!, {q0,q1}
-	cmp	sp, r9
-	bne	Lcbc_dec_bzero
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
-	vst1.8	{q15}, [r8]		@ return IV
-	VFP_ABI_POP
-	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
-
-.globl	_bsaes_ctr32_encrypt_blocks
-.private_extern	_bsaes_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func	_bsaes_ctr32_encrypt_blocks
-#endif
-.align	5
-_bsaes_ctr32_encrypt_blocks:
-	@ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
-	@ out to retain a constant-time implementation.
-	mov	ip, sp
-	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ ctr is 1st arg on the stack
-	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-	vld1.8	{q0}, [r8]		@ load counter
-#ifdef	__APPLE__
-	mov	r8, #:lower16:(LREVM0SR-LM0)
-	add	r8, r6, r8
-#else
-	add	r8, r6, #LREVM0SR-LM0	@ borrow r8
-#endif
-	vldmia	sp, {q4}		@ load round0 key
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-.align	2
-	add	r12, r3, #248
-	vld1.8	{q0}, [r8]		@ load counter
-	adrl	r8, LREVM0SR			@ borrow r8
-	vldmia	r12, {q4}			@ load round0 key
-	sub	sp, #0x10			@ place for adjusted round0 key
-#endif
-
-	vmov.i32	q8,#1		@ compose 1<<96
-	veor	q9,q9,q9
-	vrev32.8	q0,q0
-	vext.8	q8,q9,q8,#4
-	vrev32.8	q4,q4
-	vadd.u32	q9,q8,q8	@ compose 2<<96
-	vstmia	sp, {q4}		@ save adjusted round0 key
-	b	Lctr_enc_loop
-
-.align	4
-Lctr_enc_loop:
-	vadd.u32	q10, q8, q9	@ compose 3<<96
-	vadd.u32	q1, q0, q8	@ +1
-	vadd.u32	q2, q0, q9	@ +2
-	vadd.u32	q3, q0, q10	@ +3
-	vadd.u32	q4, q1, q10
-	vadd.u32	q5, q2, q10
-	vadd.u32	q6, q3, q10
-	vadd.u32	q7, q4, q10
-	vadd.u32	q10, q5, q10	@ next counter
-
-	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	@ to flip byte order in 32-bit counter
-
-	vldmia	sp, {q9}		@ load round0 key
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add	r4, sp, #0x10		@ pass next round key
-#else
-	add	r4, r3, #264
-#endif
-	vldmia	r8, {q8}			@ LREVM0SR
-	mov	r5, r10			@ pass rounds
-	vstmia	r9, {q10}			@ save next counter
-#ifdef	__APPLE__
-	mov	r6, #:lower16:(LREVM0SR-LSR)
-	sub	r6, r8, r6
-#else
-	sub	r6, r8, #LREVM0SR-LSR	@ pass constants
-#endif
-
-	bl	_bsaes_encrypt8_alt
-
-	subs	r2, r2, #8
-	blo	Lctr_enc_loop_done
-
-	vld1.8	{q8,q9}, [r0]!	@ load input
-	vld1.8	{q10,q11}, [r0]!
-	veor	q0, q8
-	veor	q1, q9
-	vld1.8	{q12,q13}, [r0]!
-	veor	q4, q10
-	veor	q6, q11
-	vld1.8	{q14,q15}, [r0]!
-	veor	q3, q12
-	vst1.8	{q0,q1}, [r1]!	@ write output
-	veor	q7, q13
-	veor	q2, q14
-	vst1.8	{q4}, [r1]!
-	veor	q5, q15
-	vst1.8	{q6}, [r1]!
-	vmov.i32	q8, #1			@ compose 1<<96
-	vst1.8	{q3}, [r1]!
-	veor	q9, q9, q9
-	vst1.8	{q7}, [r1]!
-	vext.8	q8, q9, q8, #4
-	vst1.8	{q2}, [r1]!
-	vadd.u32	q9,q8,q8		@ compose 2<<96
-	vst1.8	{q5}, [r1]!
-	vldmia	r9, {q0}			@ load counter
-
-	bne	Lctr_enc_loop
-	b	Lctr_enc_done
-
-.align	4
-Lctr_enc_loop_done:
-	add	r2, r2, #8
-	vld1.8	{q8}, [r0]!	@ load input
-	veor	q0, q8
-	vst1.8	{q0}, [r1]!	@ write output
-	cmp	r2, #2
-	blo	Lctr_enc_done
-	vld1.8	{q9}, [r0]!
-	veor	q1, q9
-	vst1.8	{q1}, [r1]!
-	beq	Lctr_enc_done
-	vld1.8	{q10}, [r0]!
-	veor	q4, q10
-	vst1.8	{q4}, [r1]!
-	cmp	r2, #4
-	blo	Lctr_enc_done
-	vld1.8	{q11}, [r0]!
-	veor	q6, q11
-	vst1.8	{q6}, [r1]!
-	beq	Lctr_enc_done
-	vld1.8	{q12}, [r0]!
-	veor	q3, q12
-	vst1.8	{q3}, [r1]!
-	cmp	r2, #6
-	blo	Lctr_enc_done
-	vld1.8	{q13}, [r0]!
-	veor	q7, q13
-	vst1.8	{q7}, [r1]!
-	beq	Lctr_enc_done
-	vld1.8	{q14}, [r0]
-	veor	q2, q14
-	vst1.8	{q2}, [r1]!
-
-Lctr_enc_done:
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifndef	BSAES_ASM_EXTENDED_KEY
-Lctr_enc_bzero:@ wipe key schedule [if any]
-	vstmia	sp!, {q0,q1}
-	cmp	sp, r9
-	bne	Lctr_enc_bzero
-#else
-	vstmia	sp, {q0,q1}
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
-	VFP_ABI_POP
-	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
-
-	@ OpenSSL contains aes_nohw_* fallback code here. We patch this
-	@ out to retain a constant-time implementation.
-
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S b/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S
deleted file mode 100644
index 0f47586..0000000
--- a/apple-arm/crypto/fipsmodule/ghash-armv4-apple.S
+++ /dev/null
@@ -1,250 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
-@ instructions are in aesv8-armx.pl.)
-
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax	unified
-#define ldrplb  ldrbpl
-#define ldrneb  ldrbne
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code	32
-#endif
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl	_gcm_init_neon
-.private_extern	_gcm_init_neon
-#ifdef __thumb2__
-.thumb_func	_gcm_init_neon
-#endif
-.align	4
-_gcm_init_neon:
-	vld1.64	d7,[r1]!		@ load H
-	vmov.i8	q8,#0xe1
-	vld1.64	d6,[r1]
-	vshl.i64	d17,#57
-	vshr.u64	d16,#63		@ t0=0xc2....01
-	vdup.8	q9,d7[7]
-	vshr.u64	d26,d6,#63
-	vshr.s8	q9,#7			@ broadcast carry bit
-	vshl.i64	q3,q3,#1
-	vand	q8,q8,q9
-	vorr	d7,d26		@ H<<<=1
-	veor	q3,q3,q8		@ twisted H
-	vstmia	r0,{q3}
-
-	bx	lr					@ bx lr
-
-
-.globl	_gcm_gmult_neon
-.private_extern	_gcm_gmult_neon
-#ifdef __thumb2__
-.thumb_func	_gcm_gmult_neon
-#endif
-.align	4
-_gcm_gmult_neon:
-	vld1.64	d7,[r0]!		@ load Xi
-	vld1.64	d6,[r0]!
-	vmov.i64	d29,#0x0000ffffffffffff
-	vldmia	r1,{d26,d27}	@ load twisted H
-	vmov.i64	d30,#0x00000000ffffffff
-#ifdef __ARMEL__
-	vrev64.8	q3,q3
-#endif
-	vmov.i64	d31,#0x000000000000ffff
-	veor	d28,d26,d27		@ Karatsuba pre-processing
-	mov	r3,#16
-	b	Lgmult_neon
-
-
-.globl	_gcm_ghash_neon
-.private_extern	_gcm_ghash_neon
-#ifdef __thumb2__
-.thumb_func	_gcm_ghash_neon
-#endif
-.align	4
-_gcm_ghash_neon:
-	vld1.64	d1,[r0]!		@ load Xi
-	vld1.64	d0,[r0]!
-	vmov.i64	d29,#0x0000ffffffffffff
-	vldmia	r1,{d26,d27}	@ load twisted H
-	vmov.i64	d30,#0x00000000ffffffff
-#ifdef __ARMEL__
-	vrev64.8	q0,q0
-#endif
-	vmov.i64	d31,#0x000000000000ffff
-	veor	d28,d26,d27		@ Karatsuba pre-processing
-
-Loop_neon:
-	vld1.64	d7,[r2]!		@ load inp
-	vld1.64	d6,[r2]!
-#ifdef __ARMEL__
-	vrev64.8	q3,q3
-#endif
-	veor	q3,q0			@ inp^=Xi
-Lgmult_neon:
-	vext.8	d16, d26, d26, #1	@ A1
-	vmull.p8	q8, d16, d6		@ F = A1*B
-	vext.8	d0, d6, d6, #1	@ B1
-	vmull.p8	q0, d26, d0		@ E = A*B1
-	vext.8	d18, d26, d26, #2	@ A2
-	vmull.p8	q9, d18, d6		@ H = A2*B
-	vext.8	d22, d6, d6, #2	@ B2
-	vmull.p8	q11, d26, d22		@ G = A*B2
-	vext.8	d20, d26, d26, #3	@ A3
-	veor	q8, q8, q0		@ L = E + F
-	vmull.p8	q10, d20, d6		@ J = A3*B
-	vext.8	d0, d6, d6, #3	@ B3
-	veor	q9, q9, q11		@ M = G + H
-	vmull.p8	q0, d26, d0		@ I = A*B3
-	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
-	vand	d17, d17, d29
-	vext.8	d22, d6, d6, #4	@ B4
-	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
-	vand	d19, d19, d30
-	vmull.p8	q11, d26, d22		@ K = A*B4
-	veor	q10, q10, q0		@ N = I + J
-	veor	d16, d16, d17
-	veor	d18, d18, d19
-	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
-	vand	d21, d21, d31
-	vext.8	q8, q8, q8, #15
-	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
-	vmov.i64	d23, #0
-	vext.8	q9, q9, q9, #14
-	veor	d20, d20, d21
-	vmull.p8	q0, d26, d6		@ D = A*B
-	vext.8	q11, q11, q11, #12
-	vext.8	q10, q10, q10, #13
-	veor	q8, q8, q9
-	veor	q10, q10, q11
-	veor	q0, q0, q8
-	veor	q0, q0, q10
-	veor	d6,d6,d7	@ Karatsuba pre-processing
-	vext.8	d16, d28, d28, #1	@ A1
-	vmull.p8	q8, d16, d6		@ F = A1*B
-	vext.8	d2, d6, d6, #1	@ B1
-	vmull.p8	q1, d28, d2		@ E = A*B1
-	vext.8	d18, d28, d28, #2	@ A2
-	vmull.p8	q9, d18, d6		@ H = A2*B
-	vext.8	d22, d6, d6, #2	@ B2
-	vmull.p8	q11, d28, d22		@ G = A*B2
-	vext.8	d20, d28, d28, #3	@ A3
-	veor	q8, q8, q1		@ L = E + F
-	vmull.p8	q10, d20, d6		@ J = A3*B
-	vext.8	d2, d6, d6, #3	@ B3
-	veor	q9, q9, q11		@ M = G + H
-	vmull.p8	q1, d28, d2		@ I = A*B3
-	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
-	vand	d17, d17, d29
-	vext.8	d22, d6, d6, #4	@ B4
-	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
-	vand	d19, d19, d30
-	vmull.p8	q11, d28, d22		@ K = A*B4
-	veor	q10, q10, q1		@ N = I + J
-	veor	d16, d16, d17
-	veor	d18, d18, d19
-	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
-	vand	d21, d21, d31
-	vext.8	q8, q8, q8, #15
-	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
-	vmov.i64	d23, #0
-	vext.8	q9, q9, q9, #14
-	veor	d20, d20, d21
-	vmull.p8	q1, d28, d6		@ D = A*B
-	vext.8	q11, q11, q11, #12
-	vext.8	q10, q10, q10, #13
-	veor	q8, q8, q9
-	veor	q10, q10, q11
-	veor	q1, q1, q8
-	veor	q1, q1, q10
-	vext.8	d16, d27, d27, #1	@ A1
-	vmull.p8	q8, d16, d7		@ F = A1*B
-	vext.8	d4, d7, d7, #1	@ B1
-	vmull.p8	q2, d27, d4		@ E = A*B1
-	vext.8	d18, d27, d27, #2	@ A2
-	vmull.p8	q9, d18, d7		@ H = A2*B
-	vext.8	d22, d7, d7, #2	@ B2
-	vmull.p8	q11, d27, d22		@ G = A*B2
-	vext.8	d20, d27, d27, #3	@ A3
-	veor	q8, q8, q2		@ L = E + F
-	vmull.p8	q10, d20, d7		@ J = A3*B
-	vext.8	d4, d7, d7, #3	@ B3
-	veor	q9, q9, q11		@ M = G + H
-	vmull.p8	q2, d27, d4		@ I = A*B3
-	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
-	vand	d17, d17, d29
-	vext.8	d22, d7, d7, #4	@ B4
-	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
-	vand	d19, d19, d30
-	vmull.p8	q11, d27, d22		@ K = A*B4
-	veor	q10, q10, q2		@ N = I + J
-	veor	d16, d16, d17
-	veor	d18, d18, d19
-	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
-	vand	d21, d21, d31
-	vext.8	q8, q8, q8, #15
-	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
-	vmov.i64	d23, #0
-	vext.8	q9, q9, q9, #14
-	veor	d20, d20, d21
-	vmull.p8	q2, d27, d7		@ D = A*B
-	vext.8	q11, q11, q11, #12
-	vext.8	q10, q10, q10, #13
-	veor	q8, q8, q9
-	veor	q10, q10, q11
-	veor	q2, q2, q8
-	veor	q2, q2, q10
-	veor	q1,q1,q0		@ Karatsuba post-processing
-	veor	q1,q1,q2
-	veor	d1,d1,d2
-	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
-
-	@ equivalent of reduction_avx from ghash-x86_64.pl
-	vshl.i64	q9,q0,#57		@ 1st phase
-	vshl.i64	q10,q0,#62
-	veor	q10,q10,q9		@
-	vshl.i64	q9,q0,#63
-	veor	q10, q10, q9		@
-	veor	d1,d1,d20	@
-	veor	d4,d4,d21
-
-	vshr.u64	q10,q0,#1		@ 2nd phase
-	veor	q2,q2,q0
-	veor	q0,q0,q10		@
-	vshr.u64	q10,q10,#6
-	vshr.u64	q0,q0,#1		@
-	veor	q0,q0,q2		@
-	veor	q0,q0,q10		@
-
-	subs	r3,#16
-	bne	Loop_neon
-
-#ifdef __ARMEL__
-	vrev64.8	q0,q0
-#endif
-	sub	r0,#16
-	vst1.64	d1,[r0]!		@ write out Xi
-	vst1.64	d0,[r0]
-
-	bx	lr					@ bx lr
-
-#endif
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S b/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S
deleted file mode 100644
index 37ee28e..0000000
--- a/apple-arm/crypto/fipsmodule/ghashv8-armv7-apple.S
+++ /dev/null
@@ -1,252 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-#if __ARM_MAX_ARCH__>=7
-.text
-
-.code	32
-#undef	__thumb2__
-.globl	_gcm_init_v8
-.private_extern	_gcm_init_v8
-#ifdef __thumb2__
-.thumb_func	_gcm_init_v8
-#endif
-.align	4
-_gcm_init_v8:
-	AARCH64_VALID_CALL_TARGET
-	vld1.64	{q9},[r1]		@ load input H
-	vmov.i8	q11,#0xe1
-	vshl.i64	q11,q11,#57		@ 0xc2.0
-	vext.8	q3,q9,q9,#8
-	vshr.u64	q10,q11,#63
-	vdup.32	q9,d18[1]
-	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
-	vshr.u64	q10,q3,#63
-	vshr.s32	q9,q9,#31		@ broadcast carry bit
-	vand	q10,q10,q8
-	vshl.i64	q3,q3,#1
-	vext.8	q10,q10,q10,#8
-	vand	q8,q8,q9
-	vorr	q3,q3,q10		@ H<<<=1
-	veor	q12,q3,q8		@ twisted H
-	vst1.64	{q12},[r0]!		@ store Htable[0]
-
-	@ calculate H^2
-	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
-.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
-	veor	q8,q8,q12
-.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
-.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8
-
-	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
-	veor	q10,q0,q2
-	veor	q1,q1,q9
-	veor	q1,q1,q10
-.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase
-
-	vmov	d4,d3		@ Xh|Xm - 256-bit result
-	vmov	d3,d0		@ Xm is rotated Xl
-	veor	q0,q1,q10
-
-	vext.8	q10,q0,q0,#8		@ 2nd phase
-.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
-	veor	q10,q10,q2
-	veor	q14,q0,q10
-
-	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
-	veor	q9,q9,q14
-	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
-	vst1.64	{q13,q14},[r0]!	@ store Htable[1..2]
-	bx	lr
-
-.globl	_gcm_gmult_v8
-.private_extern	_gcm_gmult_v8
-#ifdef __thumb2__
-.thumb_func	_gcm_gmult_v8
-#endif
-.align	4
-_gcm_gmult_v8:
-	AARCH64_VALID_CALL_TARGET
-	vld1.64	{q9},[r0]		@ load Xi
-	vmov.i8	q11,#0xe1
-	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
-	vshl.u64	q11,q11,#57
-#ifndef __ARMEB__
-	vrev64.8	q9,q9
-#endif
-	vext.8	q3,q9,q9,#8
-
-.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
-	veor	q9,q9,q3		@ Karatsuba pre-processing
-.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
-.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
-	veor	q10,q0,q2
-	veor	q1,q1,q9
-	veor	q1,q1,q10
-.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
-
-	vmov	d4,d3		@ Xh|Xm - 256-bit result
-	vmov	d3,d0		@ Xm is rotated Xl
-	veor	q0,q1,q10
-
-	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
-.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
-	veor	q10,q10,q2
-	veor	q0,q0,q10
-
-#ifndef __ARMEB__
-	vrev64.8	q0,q0
-#endif
-	vext.8	q0,q0,q0,#8
-	vst1.64	{q0},[r0]		@ write out Xi
-
-	bx	lr
-
-.globl	_gcm_ghash_v8
-.private_extern	_gcm_ghash_v8
-#ifdef __thumb2__
-.thumb_func	_gcm_ghash_v8
-#endif
-.align	4
-_gcm_ghash_v8:
-	AARCH64_VALID_CALL_TARGET
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
-	vld1.64	{q0},[r0]		@ load [rotated] Xi
-						@ "[rotated]" means that
-						@ loaded value would have
-						@ to be rotated in order to
-						@ make it appear as in
-						@ algorithm specification
-	subs	r3,r3,#32		@ see if r3 is 32 or larger
-	mov	r12,#16		@ r12 is used as post-
-						@ increment for input pointer;
-						@ as loop is modulo-scheduled
-						@ r12 is zeroed just in time
-						@ to preclude overstepping
-						@ inp[len], which means that
-						@ last block[s] are actually
-						@ loaded twice, but last
-						@ copy is not processed
-	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
-	vmov.i8	q11,#0xe1
-	vld1.64	{q14},[r1]
-	moveq	r12,#0			@ is it time to zero r12?
-	vext.8	q0,q0,q0,#8		@ rotate Xi
-	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
-	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
-#ifndef __ARMEB__
-	vrev64.8	q8,q8
-	vrev64.8	q0,q0
-#endif
-	vext.8	q3,q8,q8,#8		@ rotate I[0]
-	blo	Lodd_tail_v8		@ r3 was less than 32
-	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
-#ifndef __ARMEB__
-	vrev64.8	q9,q9
-#endif
-	vext.8	q7,q9,q9,#8
-	veor	q3,q3,q0		@ I[i]^=Xi
-.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
-	veor	q9,q9,q7		@ Karatsuba pre-processing
-.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
-	b	Loop_mod2x_v8
-
-.align	4
-Loop_mod2x_v8:
-	vext.8	q10,q3,q3,#8
-	subs	r3,r3,#32		@ is there more data?
-.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
-	movlo	r12,#0			@ is it time to zero r12?
-
-.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
-	veor	q10,q10,q3		@ Karatsuba pre-processing
-.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
-	veor	q0,q0,q4		@ accumulate
-.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
-	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]
-
-	veor	q2,q2,q6
-	moveq	r12,#0			@ is it time to zero r12?
-	veor	q1,q1,q5
-
-	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
-	veor	q10,q0,q2
-	veor	q1,q1,q9
-	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
-#ifndef __ARMEB__
-	vrev64.8	q8,q8
-#endif
-	veor	q1,q1,q10
-.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
-
-#ifndef __ARMEB__
-	vrev64.8	q9,q9
-#endif
-	vmov	d4,d3		@ Xh|Xm - 256-bit result
-	vmov	d3,d0		@ Xm is rotated Xl
-	vext.8	q7,q9,q9,#8
-	vext.8	q3,q8,q8,#8
-	veor	q0,q1,q10
-.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
-	veor	q3,q3,q2		@ accumulate q3 early
-
-	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
-.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
-	veor	q3,q3,q10
-	veor	q9,q9,q7		@ Karatsuba pre-processing
-	veor	q3,q3,q0
-.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
-	bhs	Loop_mod2x_v8		@ there was at least 32 more bytes
-
-	veor	q2,q2,q10
-	vext.8	q3,q8,q8,#8		@ re-construct q3
-	adds	r3,r3,#32		@ re-construct r3
-	veor	q0,q0,q2		@ re-construct q0
-	beq	Ldone_v8		@ is r3 zero?
-Lodd_tail_v8:
-	vext.8	q10,q0,q0,#8
-	veor	q3,q3,q0		@ inp^=Xi
-	veor	q9,q8,q10		@ q9 is rotated inp^Xi
-
-.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
-	veor	q9,q9,q3		@ Karatsuba pre-processing
-.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
-.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
-
-	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
-	veor	q10,q0,q2
-	veor	q1,q1,q9
-	veor	q1,q1,q10
-.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
-
-	vmov	d4,d3		@ Xh|Xm - 256-bit result
-	vmov	d3,d0		@ Xm is rotated Xl
-	veor	q0,q1,q10
-
-	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
-.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
-	veor	q10,q10,q2
-	veor	q0,q0,q10
-
-Ldone_v8:
-#ifndef __ARMEB__
-	vrev64.8	q0,q0
-#endif
-	vext.8	q0,q0,q0,#8
-	vst1.64	{q0},[r0]		@ write out Xi
-
-	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
-	bx	lr
-
-.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S b/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S
deleted file mode 100644
index d653f2d..0000000
--- a/apple-arm/crypto/fipsmodule/sha1-armv4-large-apple.S
+++ /dev/null
@@ -1,1510 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-#include <openssl/arm_arch.h>
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-.globl	_sha1_block_data_order
-.private_extern	_sha1_block_data_order
-#ifdef __thumb2__
-.thumb_func	_sha1_block_data_order
-#endif
-
-.align	5
-_sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-Lsha1_block:
-	adr	r3,Lsha1_block
-	ldr	r12,LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV8_SHA1
-	bne	LARMv8
-	tst	r12,#ARMV7_NEON
-	bne	LNEON
-#endif
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
-	ldmia	r0,{r3,r4,r5,r6,r7}
-Lloop:
-	ldr	r8,LK_00_19
-	mov	r14,sp
-	sub	sp,sp,#15*4
-	mov	r5,r5,ror#30
-	mov	r6,r6,ror#30
-	mov	r7,r7,ror#30		@ [6]
-L_00_15:
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r5,r6			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	eor	r10,r5,r6			@ F_xx_xx
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r4,r10,ror#2
-	add	r7,r7,r9			@ E+=X[i]
-	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r6,r8,r6,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r4,r5			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r6,r8,r6,ror#2			@ E+=K_00_19
-	eor	r10,r4,r5			@ F_xx_xx
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r3,r10,ror#2
-	add	r6,r6,r9			@ E+=X[i]
-	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r5,r8,r5,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r3,r4			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r5,r8,r5,ror#2			@ E+=K_00_19
-	eor	r10,r3,r4			@ F_xx_xx
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r7,r10,ror#2
-	add	r5,r5,r9			@ E+=X[i]
-	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r4,r8,r4,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r7,r3			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r4,r8,r4,ror#2			@ E+=K_00_19
-	eor	r10,r7,r3			@ F_xx_xx
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r6,r10,ror#2
-	add	r4,r4,r9			@ E+=X[i]
-	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r3,r8,r3,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r6,r7			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r3,r8,r3,ror#2			@ E+=K_00_19
-	eor	r10,r6,r7			@ F_xx_xx
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r5,r10,ror#2
-	add	r3,r3,r9			@ E+=X[i]
-	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
-#if defined(__thumb2__)
-	mov	r12,sp
-	teq	r14,r12
-#else
-	teq	r14,sp
-#endif
-	bne	L_00_15		@ [((11+4)*5+2)*3]
-	sub	sp,sp,#25*4
-#if __ARM_ARCH__<7
-	ldrb	r10,[r1,#2]
-	ldrb	r9,[r1,#3]
-	ldrb	r11,[r1,#1]
-	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	ldrb	r12,[r1],#4
-	orr	r9,r9,r10,lsl#8
-	eor	r10,r5,r6			@ F_xx_xx
-	orr	r9,r9,r11,lsl#16
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	orr	r9,r9,r12,lsl#24
-#else
-	ldr	r9,[r1],#4			@ handles unaligned
-	add	r7,r8,r7,ror#2			@ E+=K_00_19
-	eor	r10,r5,r6			@ F_xx_xx
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-#ifdef __ARMEL__
-	rev	r9,r9				@ byte swap
-#endif
-#endif
-	and	r10,r4,r10,ror#2
-	add	r7,r7,r9			@ E+=X[i]
-	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
-	str	r9,[r14,#-4]!
-	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r4,r5			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r3,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r6,r6,r9			@ E+=X[i]
-	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
-	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r3,r4			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r7,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r5,r5,r9			@ E+=X[i]
-	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
-	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r7,r3			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r6,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r4,r4,r9			@ E+=X[i]
-	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
-	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r6,r7			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r5,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r3,r3,r9			@ E+=X[i]
-	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
-	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
-
-	ldr	r8,LK_20_39		@ [+15+16*4]
-	cmn	sp,#0			@ [+3], clear carry to denote 20_39
-L_20_39_or_60_79:
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r5,r6			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	eor	r10,r4,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r7,r7,r9			@ E+=X[i]
-	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r4,r5			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	eor	r10,r3,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r6,r6,r9			@ E+=X[i]
-	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r3,r4			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	eor	r10,r7,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r5,r5,r9			@ E+=X[i]
-	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r7,r3			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	eor	r10,r6,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r4,r4,r9			@ E+=X[i]
-	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r6,r7			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	eor	r10,r5,r10,ror#2					@ F_xx_xx
-						@ F_xx_xx
-	add	r3,r3,r9			@ E+=X[i]
-	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
-#if defined(__thumb2__)
-	mov	r12,sp
-	teq	r14,r12
-#else
-	teq	r14,sp			@ preserve carry
-#endif
-	bne	L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
-	bcs	L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
-
-	ldr	r8,LK_40_59
-	sub	sp,sp,#20*4		@ [+2]
-L_40_59:
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r5,r6			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r4,r10,ror#2					@ F_xx_xx
-	and	r11,r5,r6					@ F_xx_xx
-	add	r7,r7,r9			@ E+=X[i]
-	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
-	add	r7,r7,r11,ror#2
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r4,r5			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r3,r10,ror#2					@ F_xx_xx
-	and	r11,r4,r5					@ F_xx_xx
-	add	r6,r6,r9			@ E+=X[i]
-	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
-	add	r6,r6,r11,ror#2
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r3,r4			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r7,r10,ror#2					@ F_xx_xx
-	and	r11,r3,r4					@ F_xx_xx
-	add	r5,r5,r9			@ E+=X[i]
-	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
-	add	r5,r5,r11,ror#2
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r7,r3			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r6,r10,ror#2					@ F_xx_xx
-	and	r11,r7,r3					@ F_xx_xx
-	add	r4,r4,r9			@ E+=X[i]
-	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
-	add	r4,r4,r11,ror#2
-	ldr	r9,[r14,#15*4]
-	ldr	r10,[r14,#13*4]
-	ldr	r11,[r14,#7*4]
-	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
-	ldr	r12,[r14,#2*4]
-	eor	r9,r9,r10
-	eor	r11,r11,r12			@ 1 cycle stall
-	eor	r10,r6,r7			@ F_xx_xx
-	mov	r9,r9,ror#31
-	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
-	eor	r9,r9,r11,ror#31
-	str	r9,[r14,#-4]!
-	and	r10,r5,r10,ror#2					@ F_xx_xx
-	and	r11,r6,r7					@ F_xx_xx
-	add	r3,r3,r9			@ E+=X[i]
-	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
-	add	r3,r3,r11,ror#2
-#if defined(__thumb2__)
-	mov	r12,sp
-	teq	r14,r12
-#else
-	teq	r14,sp
-#endif
-	bne	L_40_59		@ [+((12+5)*5+2)*4]
-
-	ldr	r8,LK_60_79
-	sub	sp,sp,#20*4
-	cmp	sp,#0			@ set carry to denote 60_79
-	b	L_20_39_or_60_79	@ [+4], spare 300 bytes
-L_done:
-	add	sp,sp,#80*4		@ "deallocate" stack frame
-	ldmia	r0,{r8,r9,r10,r11,r12}
-	add	r3,r8,r3
-	add	r4,r9,r4
-	add	r5,r10,r5,ror#2
-	add	r6,r11,r6,ror#2
-	add	r7,r12,r7,ror#2
-	stmia	r0,{r3,r4,r5,r6,r7}
-	teq	r1,r2
-	bne	Lloop			@ [+18], total 1307
-
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-#else
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-#endif
-
-
-.align	5
-LK_00_19:.word	0x5a827999
-LK_20_39:.word	0x6ed9eba1
-LK_40_59:.word	0x8f1bbcdc
-LK_60_79:.word	0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-Lsha1_block
-#endif
-.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	5
-#if __ARM_MAX_ARCH__>=7
-
-
-
-#ifdef __thumb2__
-.thumb_func	sha1_block_data_order_neon
-#endif
-.align	4
-sha1_block_data_order_neon:
-LNEON:
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
-	@ dmb				@ errata #451034 on early Cortex A8
-	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
-	mov	r14,sp
-	sub	r12,sp,#64
-	adr	r8,LK_00_19
-	bic	r12,r12,#15		@ align for 128-bit stores
-
-	ldmia	r0,{r3,r4,r5,r6,r7}	@ load context
-	mov	sp,r12		@ alloca
-
-	vld1.8	{q0,q1},[r1]!	@ handles unaligned
-	veor	q15,q15,q15
-	vld1.8	{q2,q3},[r1]!
-	vld1.32	{d28[],d29[]},[r8,:32]!	@ load K_00_19
-	vrev32.8	q0,q0		@ yes, even on
-	vrev32.8	q1,q1		@ big-endian...
-	vrev32.8	q2,q2
-	vadd.i32	q8,q0,q14
-	vrev32.8	q3,q3
-	vadd.i32	q9,q1,q14
-	vst1.32	{q8},[r12,:128]!
-	vadd.i32	q10,q2,q14
-	vst1.32	{q9},[r12,:128]!
-	vst1.32	{q10},[r12,:128]!
-	ldr	r9,[sp]			@ big RAW stall
-
-Loop_neon:
-	vext.8	q8,q0,q1,#8
-	bic	r10,r6,r4
-	add	r7,r7,r9
-	and	r11,r5,r4
-	vadd.i32	q13,q3,q14
-	ldr	r9,[sp,#4]
-	add	r7,r7,r3,ror#27
-	vext.8	q12,q3,q15,#4
-	eor	r11,r11,r10
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	veor	q8,q8,q0
-	bic	r10,r5,r3
-	add	r6,r6,r9
-	veor	q12,q12,q2
-	and	r11,r4,r3
-	ldr	r9,[sp,#8]
-	veor	q12,q12,q8
-	add	r6,r6,r7,ror#27
-	eor	r11,r11,r10
-	vst1.32	{q13},[r12,:128]!
-	sub	r12,r12,#64
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vext.8	q13,q15,q12,#4
-	bic	r10,r4,r7
-	add	r5,r5,r9
-	vadd.i32	q8,q12,q12
-	and	r11,r3,r7
-	ldr	r9,[sp,#12]
-	vsri.32	q8,q12,#31
-	add	r5,r5,r6,ror#27
-	eor	r11,r11,r10
-	mov	r7,r7,ror#2
-	vshr.u32	q12,q13,#30
-	add	r5,r5,r11
-	bic	r10,r3,r6
-	vshl.u32	q13,q13,#2
-	add	r4,r4,r9
-	and	r11,r7,r6
-	veor	q8,q8,q12
-	ldr	r9,[sp,#16]
-	add	r4,r4,r5,ror#27
-	veor	q8,q8,q13
-	eor	r11,r11,r10
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vext.8	q9,q1,q2,#8
-	bic	r10,r7,r5
-	add	r3,r3,r9
-	and	r11,r6,r5
-	vadd.i32	q13,q8,q14
-	ldr	r9,[sp,#20]
-	vld1.32	{d28[],d29[]},[r8,:32]!
-	add	r3,r3,r4,ror#27
-	vext.8	q12,q8,q15,#4
-	eor	r11,r11,r10
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	veor	q9,q9,q1
-	bic	r10,r6,r4
-	add	r7,r7,r9
-	veor	q12,q12,q3
-	and	r11,r5,r4
-	ldr	r9,[sp,#24]
-	veor	q12,q12,q9
-	add	r7,r7,r3,ror#27
-	eor	r11,r11,r10
-	vst1.32	{q13},[r12,:128]!
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vext.8	q13,q15,q12,#4
-	bic	r10,r5,r3
-	add	r6,r6,r9
-	vadd.i32	q9,q12,q12
-	and	r11,r4,r3
-	ldr	r9,[sp,#28]
-	vsri.32	q9,q12,#31
-	add	r6,r6,r7,ror#27
-	eor	r11,r11,r10
-	mov	r3,r3,ror#2
-	vshr.u32	q12,q13,#30
-	add	r6,r6,r11
-	bic	r10,r4,r7
-	vshl.u32	q13,q13,#2
-	add	r5,r5,r9
-	and	r11,r3,r7
-	veor	q9,q9,q12
-	ldr	r9,[sp,#32]
-	add	r5,r5,r6,ror#27
-	veor	q9,q9,q13
-	eor	r11,r11,r10
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	vext.8	q10,q2,q3,#8
-	bic	r10,r3,r6
-	add	r4,r4,r9
-	and	r11,r7,r6
-	vadd.i32	q13,q9,q14
-	ldr	r9,[sp,#36]
-	add	r4,r4,r5,ror#27
-	vext.8	q12,q9,q15,#4
-	eor	r11,r11,r10
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	veor	q10,q10,q2
-	bic	r10,r7,r5
-	add	r3,r3,r9
-	veor	q12,q12,q8
-	and	r11,r6,r5
-	ldr	r9,[sp,#40]
-	veor	q12,q12,q10
-	add	r3,r3,r4,ror#27
-	eor	r11,r11,r10
-	vst1.32	{q13},[r12,:128]!
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	vext.8	q13,q15,q12,#4
-	bic	r10,r6,r4
-	add	r7,r7,r9
-	vadd.i32	q10,q12,q12
-	and	r11,r5,r4
-	ldr	r9,[sp,#44]
-	vsri.32	q10,q12,#31
-	add	r7,r7,r3,ror#27
-	eor	r11,r11,r10
-	mov	r4,r4,ror#2
-	vshr.u32	q12,q13,#30
-	add	r7,r7,r11
-	bic	r10,r5,r3
-	vshl.u32	q13,q13,#2
-	add	r6,r6,r9
-	and	r11,r4,r3
-	veor	q10,q10,q12
-	ldr	r9,[sp,#48]
-	add	r6,r6,r7,ror#27
-	veor	q10,q10,q13
-	eor	r11,r11,r10
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vext.8	q11,q3,q8,#8
-	bic	r10,r4,r7
-	add	r5,r5,r9
-	and	r11,r3,r7
-	vadd.i32	q13,q10,q14
-	ldr	r9,[sp,#52]
-	add	r5,r5,r6,ror#27
-	vext.8	q12,q10,q15,#4
-	eor	r11,r11,r10
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	veor	q11,q11,q3
-	bic	r10,r3,r6
-	add	r4,r4,r9
-	veor	q12,q12,q9
-	and	r11,r7,r6
-	ldr	r9,[sp,#56]
-	veor	q12,q12,q11
-	add	r4,r4,r5,ror#27
-	eor	r11,r11,r10
-	vst1.32	{q13},[r12,:128]!
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vext.8	q13,q15,q12,#4
-	bic	r10,r7,r5
-	add	r3,r3,r9
-	vadd.i32	q11,q12,q12
-	and	r11,r6,r5
-	ldr	r9,[sp,#60]
-	vsri.32	q11,q12,#31
-	add	r3,r3,r4,ror#27
-	eor	r11,r11,r10
-	mov	r5,r5,ror#2
-	vshr.u32	q12,q13,#30
-	add	r3,r3,r11
-	bic	r10,r6,r4
-	vshl.u32	q13,q13,#2
-	add	r7,r7,r9
-	and	r11,r5,r4
-	veor	q11,q11,q12
-	ldr	r9,[sp,#0]
-	add	r7,r7,r3,ror#27
-	veor	q11,q11,q13
-	eor	r11,r11,r10
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vext.8	q12,q10,q11,#8
-	bic	r10,r5,r3
-	add	r6,r6,r9
-	and	r11,r4,r3
-	veor	q0,q0,q8
-	ldr	r9,[sp,#4]
-	add	r6,r6,r7,ror#27
-	veor	q0,q0,q1
-	eor	r11,r11,r10
-	mov	r3,r3,ror#2
-	vadd.i32	q13,q11,q14
-	add	r6,r6,r11
-	bic	r10,r4,r7
-	veor	q12,q12,q0
-	add	r5,r5,r9
-	and	r11,r3,r7
-	vshr.u32	q0,q12,#30
-	ldr	r9,[sp,#8]
-	add	r5,r5,r6,ror#27
-	vst1.32	{q13},[r12,:128]!
-	sub	r12,r12,#64
-	eor	r11,r11,r10
-	mov	r7,r7,ror#2
-	vsli.32	q0,q12,#2
-	add	r5,r5,r11
-	bic	r10,r3,r6
-	add	r4,r4,r9
-	and	r11,r7,r6
-	ldr	r9,[sp,#12]
-	add	r4,r4,r5,ror#27
-	eor	r11,r11,r10
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	bic	r10,r7,r5
-	add	r3,r3,r9
-	and	r11,r6,r5
-	ldr	r9,[sp,#16]
-	add	r3,r3,r4,ror#27
-	eor	r11,r11,r10
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	vext.8	q12,q11,q0,#8
-	eor	r10,r4,r6
-	add	r7,r7,r9
-	ldr	r9,[sp,#20]
-	veor	q1,q1,q9
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	veor	q1,q1,q2
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vadd.i32	q13,q0,q14
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	veor	q12,q12,q1
-	ldr	r9,[sp,#24]
-	eor	r11,r10,r4
-	vshr.u32	q1,q12,#30
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	vst1.32	{q13},[r12,:128]!
-	add	r6,r6,r11
-	eor	r10,r7,r4
-	vsli.32	q1,q12,#2
-	add	r5,r5,r9
-	ldr	r9,[sp,#28]
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	ldr	r9,[sp,#32]
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vext.8	q12,q0,q1,#8
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	ldr	r9,[sp,#36]
-	veor	q2,q2,q10
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	veor	q2,q2,q3
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	vadd.i32	q13,q1,q14
-	eor	r10,r4,r6
-	vld1.32	{d28[],d29[]},[r8,:32]!
-	add	r7,r7,r9
-	veor	q12,q12,q2
-	ldr	r9,[sp,#40]
-	eor	r11,r10,r5
-	vshr.u32	q2,q12,#30
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	vst1.32	{q13},[r12,:128]!
-	add	r7,r7,r11
-	eor	r10,r3,r5
-	vsli.32	q2,q12,#2
-	add	r6,r6,r9
-	ldr	r9,[sp,#44]
-	eor	r11,r10,r4
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	ldr	r9,[sp,#48]
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	vext.8	q12,q1,q2,#8
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	ldr	r9,[sp,#52]
-	veor	q3,q3,q11
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	veor	q3,q3,q8
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vadd.i32	q13,q2,q14
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	veor	q12,q12,q3
-	ldr	r9,[sp,#56]
-	eor	r11,r10,r6
-	vshr.u32	q3,q12,#30
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	vst1.32	{q13},[r12,:128]!
-	add	r3,r3,r11
-	eor	r10,r4,r6
-	vsli.32	q3,q12,#2
-	add	r7,r7,r9
-	ldr	r9,[sp,#60]
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	ldr	r9,[sp,#0]
-	eor	r11,r10,r4
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vext.8	q12,q2,q3,#8
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	ldr	r9,[sp,#4]
-	veor	q8,q8,q0
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	veor	q8,q8,q9
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	vadd.i32	q13,q3,q14
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	veor	q12,q12,q8
-	ldr	r9,[sp,#8]
-	eor	r11,r10,r7
-	vshr.u32	q8,q12,#30
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	vst1.32	{q13},[r12,:128]!
-	sub	r12,r12,#64
-	add	r4,r4,r11
-	eor	r10,r5,r7
-	vsli.32	q8,q12,#2
-	add	r3,r3,r9
-	ldr	r9,[sp,#12]
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	eor	r10,r4,r6
-	add	r7,r7,r9
-	ldr	r9,[sp,#16]
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vext.8	q12,q3,q8,#8
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	ldr	r9,[sp,#20]
-	veor	q9,q9,q1
-	eor	r11,r10,r4
-	add	r6,r6,r7,ror#27
-	veor	q9,q9,q10
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vadd.i32	q13,q8,q14
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	veor	q12,q12,q9
-	ldr	r9,[sp,#24]
-	eor	r11,r10,r3
-	vshr.u32	q9,q12,#30
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	vst1.32	{q13},[r12,:128]!
-	add	r5,r5,r11
-	eor	r10,r6,r3
-	vsli.32	q9,q12,#2
-	add	r4,r4,r9
-	ldr	r9,[sp,#28]
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	ldr	r9,[sp,#32]
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	vext.8	q12,q8,q9,#8
-	add	r7,r7,r9
-	and	r10,r5,r6
-	ldr	r9,[sp,#36]
-	veor	q10,q10,q2
-	add	r7,r7,r3,ror#27
-	eor	r11,r5,r6
-	veor	q10,q10,q11
-	add	r7,r7,r10
-	and	r11,r11,r4
-	vadd.i32	q13,q9,q14
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	veor	q12,q12,q10
-	add	r6,r6,r9
-	and	r10,r4,r5
-	vshr.u32	q10,q12,#30
-	ldr	r9,[sp,#40]
-	add	r6,r6,r7,ror#27
-	vst1.32	{q13},[r12,:128]!
-	eor	r11,r4,r5
-	add	r6,r6,r10
-	vsli.32	q10,q12,#2
-	and	r11,r11,r3
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	add	r5,r5,r9
-	and	r10,r3,r4
-	ldr	r9,[sp,#44]
-	add	r5,r5,r6,ror#27
-	eor	r11,r3,r4
-	add	r5,r5,r10
-	and	r11,r11,r7
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	add	r4,r4,r9
-	and	r10,r7,r3
-	ldr	r9,[sp,#48]
-	add	r4,r4,r5,ror#27
-	eor	r11,r7,r3
-	add	r4,r4,r10
-	and	r11,r11,r6
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vext.8	q12,q9,q10,#8
-	add	r3,r3,r9
-	and	r10,r6,r7
-	ldr	r9,[sp,#52]
-	veor	q11,q11,q3
-	add	r3,r3,r4,ror#27
-	eor	r11,r6,r7
-	veor	q11,q11,q0
-	add	r3,r3,r10
-	and	r11,r11,r5
-	vadd.i32	q13,q10,q14
-	mov	r5,r5,ror#2
-	vld1.32	{d28[],d29[]},[r8,:32]!
-	add	r3,r3,r11
-	veor	q12,q12,q11
-	add	r7,r7,r9
-	and	r10,r5,r6
-	vshr.u32	q11,q12,#30
-	ldr	r9,[sp,#56]
-	add	r7,r7,r3,ror#27
-	vst1.32	{q13},[r12,:128]!
-	eor	r11,r5,r6
-	add	r7,r7,r10
-	vsli.32	q11,q12,#2
-	and	r11,r11,r4
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	add	r6,r6,r9
-	and	r10,r4,r5
-	ldr	r9,[sp,#60]
-	add	r6,r6,r7,ror#27
-	eor	r11,r4,r5
-	add	r6,r6,r10
-	and	r11,r11,r3
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	add	r5,r5,r9
-	and	r10,r3,r4
-	ldr	r9,[sp,#0]
-	add	r5,r5,r6,ror#27
-	eor	r11,r3,r4
-	add	r5,r5,r10
-	and	r11,r11,r7
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	vext.8	q12,q10,q11,#8
-	add	r4,r4,r9
-	and	r10,r7,r3
-	ldr	r9,[sp,#4]
-	veor	q0,q0,q8
-	add	r4,r4,r5,ror#27
-	eor	r11,r7,r3
-	veor	q0,q0,q1
-	add	r4,r4,r10
-	and	r11,r11,r6
-	vadd.i32	q13,q11,q14
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	veor	q12,q12,q0
-	add	r3,r3,r9
-	and	r10,r6,r7
-	vshr.u32	q0,q12,#30
-	ldr	r9,[sp,#8]
-	add	r3,r3,r4,ror#27
-	vst1.32	{q13},[r12,:128]!
-	sub	r12,r12,#64
-	eor	r11,r6,r7
-	add	r3,r3,r10
-	vsli.32	q0,q12,#2
-	and	r11,r11,r5
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	add	r7,r7,r9
-	and	r10,r5,r6
-	ldr	r9,[sp,#12]
-	add	r7,r7,r3,ror#27
-	eor	r11,r5,r6
-	add	r7,r7,r10
-	and	r11,r11,r4
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	add	r6,r6,r9
-	and	r10,r4,r5
-	ldr	r9,[sp,#16]
-	add	r6,r6,r7,ror#27
-	eor	r11,r4,r5
-	add	r6,r6,r10
-	and	r11,r11,r3
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vext.8	q12,q11,q0,#8
-	add	r5,r5,r9
-	and	r10,r3,r4
-	ldr	r9,[sp,#20]
-	veor	q1,q1,q9
-	add	r5,r5,r6,ror#27
-	eor	r11,r3,r4
-	veor	q1,q1,q2
-	add	r5,r5,r10
-	and	r11,r11,r7
-	vadd.i32	q13,q0,q14
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	veor	q12,q12,q1
-	add	r4,r4,r9
-	and	r10,r7,r3
-	vshr.u32	q1,q12,#30
-	ldr	r9,[sp,#24]
-	add	r4,r4,r5,ror#27
-	vst1.32	{q13},[r12,:128]!
-	eor	r11,r7,r3
-	add	r4,r4,r10
-	vsli.32	q1,q12,#2
-	and	r11,r11,r6
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	add	r3,r3,r9
-	and	r10,r6,r7
-	ldr	r9,[sp,#28]
-	add	r3,r3,r4,ror#27
-	eor	r11,r6,r7
-	add	r3,r3,r10
-	and	r11,r11,r5
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	add	r7,r7,r9
-	and	r10,r5,r6
-	ldr	r9,[sp,#32]
-	add	r7,r7,r3,ror#27
-	eor	r11,r5,r6
-	add	r7,r7,r10
-	and	r11,r11,r4
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vext.8	q12,q0,q1,#8
-	add	r6,r6,r9
-	and	r10,r4,r5
-	ldr	r9,[sp,#36]
-	veor	q2,q2,q10
-	add	r6,r6,r7,ror#27
-	eor	r11,r4,r5
-	veor	q2,q2,q3
-	add	r6,r6,r10
-	and	r11,r11,r3
-	vadd.i32	q13,q1,q14
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	veor	q12,q12,q2
-	add	r5,r5,r9
-	and	r10,r3,r4
-	vshr.u32	q2,q12,#30
-	ldr	r9,[sp,#40]
-	add	r5,r5,r6,ror#27
-	vst1.32	{q13},[r12,:128]!
-	eor	r11,r3,r4
-	add	r5,r5,r10
-	vsli.32	q2,q12,#2
-	and	r11,r11,r7
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	add	r4,r4,r9
-	and	r10,r7,r3
-	ldr	r9,[sp,#44]
-	add	r4,r4,r5,ror#27
-	eor	r11,r7,r3
-	add	r4,r4,r10
-	and	r11,r11,r6
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	add	r3,r3,r9
-	and	r10,r6,r7
-	ldr	r9,[sp,#48]
-	add	r3,r3,r4,ror#27
-	eor	r11,r6,r7
-	add	r3,r3,r10
-	and	r11,r11,r5
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	vext.8	q12,q1,q2,#8
-	eor	r10,r4,r6
-	add	r7,r7,r9
-	ldr	r9,[sp,#52]
-	veor	q3,q3,q11
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	veor	q3,q3,q8
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vadd.i32	q13,q2,q14
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	veor	q12,q12,q3
-	ldr	r9,[sp,#56]
-	eor	r11,r10,r4
-	vshr.u32	q3,q12,#30
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	vst1.32	{q13},[r12,:128]!
-	add	r6,r6,r11
-	eor	r10,r7,r4
-	vsli.32	q3,q12,#2
-	add	r5,r5,r9
-	ldr	r9,[sp,#60]
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	ldr	r9,[sp,#0]
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	vadd.i32	q13,q3,q14
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	vst1.32	{q13},[r12,:128]!
-	sub	r12,r12,#64
-	teq	r1,r2
-	sub	r8,r8,#16
-	it	eq
-	subeq	r1,r1,#64
-	vld1.8	{q0,q1},[r1]!
-	ldr	r9,[sp,#4]
-	eor	r11,r10,r6
-	vld1.8	{q2,q3},[r1]!
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	vld1.32	{d28[],d29[]},[r8,:32]!
-	add	r3,r3,r11
-	eor	r10,r4,r6
-	vrev32.8	q0,q0
-	add	r7,r7,r9
-	ldr	r9,[sp,#8]
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	ldr	r9,[sp,#12]
-	eor	r11,r10,r4
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	ldr	r9,[sp,#16]
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	vrev32.8	q1,q1
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	vadd.i32	q8,q0,q14
-	ldr	r9,[sp,#20]
-	eor	r11,r10,r7
-	vst1.32	{q8},[r12,:128]!
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	ldr	r9,[sp,#24]
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	eor	r10,r4,r6
-	add	r7,r7,r9
-	ldr	r9,[sp,#28]
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	ldr	r9,[sp,#32]
-	eor	r11,r10,r4
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	vrev32.8	q2,q2
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	vadd.i32	q9,q1,q14
-	ldr	r9,[sp,#36]
-	eor	r11,r10,r3
-	vst1.32	{q9},[r12,:128]!
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	ldr	r9,[sp,#40]
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	ldr	r9,[sp,#44]
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	eor	r10,r4,r6
-	add	r7,r7,r9
-	ldr	r9,[sp,#48]
-	eor	r11,r10,r5
-	add	r7,r7,r3,ror#27
-	mov	r4,r4,ror#2
-	add	r7,r7,r11
-	vrev32.8	q3,q3
-	eor	r10,r3,r5
-	add	r6,r6,r9
-	vadd.i32	q10,q2,q14
-	ldr	r9,[sp,#52]
-	eor	r11,r10,r4
-	vst1.32	{q10},[r12,:128]!
-	add	r6,r6,r7,ror#27
-	mov	r3,r3,ror#2
-	add	r6,r6,r11
-	eor	r10,r7,r4
-	add	r5,r5,r9
-	ldr	r9,[sp,#56]
-	eor	r11,r10,r3
-	add	r5,r5,r6,ror#27
-	mov	r7,r7,ror#2
-	add	r5,r5,r11
-	eor	r10,r6,r3
-	add	r4,r4,r9
-	ldr	r9,[sp,#60]
-	eor	r11,r10,r7
-	add	r4,r4,r5,ror#27
-	mov	r6,r6,ror#2
-	add	r4,r4,r11
-	eor	r10,r5,r7
-	add	r3,r3,r9
-	eor	r11,r10,r6
-	add	r3,r3,r4,ror#27
-	mov	r5,r5,ror#2
-	add	r3,r3,r11
-	ldmia	r0,{r9,r10,r11,r12}	@ accumulate context
-	add	r3,r3,r9
-	ldr	r9,[r0,#16]
-	add	r4,r4,r10
-	add	r5,r5,r11
-	add	r6,r6,r12
-	it	eq
-	moveq	sp,r14
-	add	r7,r7,r9
-	it	ne
-	ldrne	r9,[sp]
-	stmia	r0,{r3,r4,r5,r6,r7}
-	itt	ne
-	addne	r12,sp,#3*16
-	bne	Loop_neon
-
-	@ vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-
-#endif
-#if __ARM_MAX_ARCH__>=7
-
-# if defined(__thumb2__)
-#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
-# else
-#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
-# endif
-
-#ifdef __thumb2__
-.thumb_func	sha1_block_data_order_armv8
-#endif
-.align	5
-sha1_block_data_order_armv8:
-LARMv8:
-	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
-
-	veor	q1,q1,q1
-	adr	r3,LK_00_19
-	vld1.32	{q0},[r0]!
-	vld1.32	{d2[0]},[r0]
-	sub	r0,r0,#16
-	vld1.32	{d16[],d17[]},[r3,:32]!
-	vld1.32	{d18[],d19[]},[r3,:32]!
-	vld1.32	{d20[],d21[]},[r3,:32]!
-	vld1.32	{d22[],d23[]},[r3,:32]
-
-Loop_v8:
-	vld1.8	{q4,q5},[r1]!
-	vld1.8	{q6,q7},[r1]!
-	vrev32.8	q4,q4
-	vrev32.8	q5,q5
-
-	vadd.i32	q12,q8,q4
-	vrev32.8	q6,q6
-	vmov	q14,q0	@ offload
-	subs	r2,r2,#1
-
-	vadd.i32	q13,q8,q5
-	vrev32.8	q7,q7
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 0
-	INST(0x68,0x0c,0x02,0xe2)	@ sha1c q0,q1,q12
-	vadd.i32	q12,q8,q6
-	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 1
-	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
-	vadd.i32	q13,q8,q7
-	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
-	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 2
-	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
-	vadd.i32	q12,q8,q4
-	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
-	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 3
-	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
-	vadd.i32	q13,q9,q5
-	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
-	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 4
-	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
-	vadd.i32	q12,q9,q6
-	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
-	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 5
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-	vadd.i32	q13,q9,q7
-	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
-	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 6
-	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
-	vadd.i32	q12,q9,q4
-	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
-	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 7
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-	vadd.i32	q13,q9,q5
-	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
-	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 8
-	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
-	vadd.i32	q12,q10,q6
-	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
-	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 9
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-	vadd.i32	q13,q10,q7
-	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
-	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 10
-	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
-	vadd.i32	q12,q10,q4
-	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
-	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 11
-	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
-	vadd.i32	q13,q10,q5
-	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
-	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 12
-	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
-	vadd.i32	q12,q10,q6
-	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
-	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 13
-	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
-	vadd.i32	q13,q11,q7
-	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
-	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 14
-	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
-	vadd.i32	q12,q11,q4
-	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
-	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 15
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-	vadd.i32	q13,q11,q5
-	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
-	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 16
-	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
-	vadd.i32	q12,q11,q6
-	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 17
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-	vadd.i32	q13,q11,q7
-
-	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 18
-	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
-
-	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 19
-	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
-
-	vadd.i32	q1,q1,q2
-	vadd.i32	q0,q0,q14
-	bne	Loop_v8
-
-	vst1.32	{q0},[r0]!
-	vst1.32	{d2[0]},[r0]
-
-	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
-	bx	lr					@ bx lr
-
-#endif
-#if __ARM_MAX_ARCH__>=7
-.comm	_OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol	_OPENSSL_armcap_P
-.long	0
-.private_extern	_OPENSSL_armcap_P
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S b/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S
deleted file mode 100644
index 8379765..0000000
--- a/apple-arm/crypto/fipsmodule/sha256-armv4-apple.S
+++ /dev/null
@@ -1,2838 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License").  You may not use
-@ this file except in compliance with the License.  You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA256 block procedure for ARMv4. May 2007.
-
-@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
-@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-@ byte [on single-issue Xscale PXA250 core].
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
-@ Cortex A8 core and ~20 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-@ September 2013.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process one
-@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-@ code (meaning that latter performs sub-optimally, nothing was done
-@ about it).
-
-@ May 2014.
-@
-@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
-@ instructions are manually-encoded. (See unsha256.)
-
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-
-.align	5
-K256:
-.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.word	0				@ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-Lsha256_block_data_order
-#endif
-.align	5
-
-.globl	_sha256_block_data_order
-.private_extern	_sha256_block_data_order
-#ifdef __thumb2__
-.thumb_func	_sha256_block_data_order
-#endif
-_sha256_block_data_order:
-Lsha256_block_data_order:
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
-	sub	r3,pc,#8		@ _sha256_block_data_order
-#else
-	adr	r3,Lsha256_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	ldr	r12,LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV8_SHA256
-	bne	LARMv8
-	tst	r12,#ARMV7_NEON
-	bne	LNEON
-#endif
-	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
-	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
-	sub	r14,r3,#256+32	@ K256
-	sub	sp,sp,#16*4		@ alloca(X[16])
-Loop:
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6		@ magic
-	eor	r12,r12,r12
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 0
-# if 0==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r8,r8,ror#5
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 0
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 0==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r8,r8,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r11,r11,r2			@ h+=X[i]
-	str	r2,[sp,#0*4]
-	eor	r2,r9,r10
-	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r8
-	add	r11,r11,r12			@ h+=K256[i]
-	eor	r2,r2,r10			@ Ch(e,f,g)
-	eor	r0,r4,r4,ror#11
-	add	r11,r11,r2			@ h+=Ch(e,f,g)
-#if 0==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 0<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r4,r5			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
-	eor	r12,r4,r5			@ a^b, b^c in next round
-	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r4,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r7,r7,r11			@ d+=h
-	eor	r3,r3,r5			@ Maj(a,b,c)
-	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 1
-# if 1==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r7,r7,ror#5
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 1
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 1==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r7,r7,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r10,r10,r2			@ h+=X[i]
-	str	r2,[sp,#1*4]
-	eor	r2,r8,r9
-	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r7
-	add	r10,r10,r3			@ h+=K256[i]
-	eor	r2,r2,r9			@ Ch(e,f,g)
-	eor	r0,r11,r11,ror#11
-	add	r10,r10,r2			@ h+=Ch(e,f,g)
-#if 1==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 1<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r11,r4			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
-	eor	r3,r11,r4			@ a^b, b^c in next round
-	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r11,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r6,r6,r10			@ d+=h
-	eor	r12,r12,r4			@ Maj(a,b,c)
-	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 2
-# if 2==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 2
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 2==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r6,r6,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r9,r9,r2			@ h+=X[i]
-	str	r2,[sp,#2*4]
-	eor	r2,r7,r8
-	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r6
-	add	r9,r9,r12			@ h+=K256[i]
-	eor	r2,r2,r8			@ Ch(e,f,g)
-	eor	r0,r10,r10,ror#11
-	add	r9,r9,r2			@ h+=Ch(e,f,g)
-#if 2==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 2<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r10,r11			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
-	eor	r12,r10,r11			@ a^b, b^c in next round
-	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r10,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r5,r5,r9			@ d+=h
-	eor	r3,r3,r11			@ Maj(a,b,c)
-	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 3
-# if 3==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r5,r5,ror#5
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 3
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 3==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r5,r5,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r8,r8,r2			@ h+=X[i]
-	str	r2,[sp,#3*4]
-	eor	r2,r6,r7
-	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r5
-	add	r8,r8,r3			@ h+=K256[i]
-	eor	r2,r2,r7			@ Ch(e,f,g)
-	eor	r0,r9,r9,ror#11
-	add	r8,r8,r2			@ h+=Ch(e,f,g)
-#if 3==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 3<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r9,r10			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
-	eor	r3,r9,r10			@ a^b, b^c in next round
-	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r9,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r4,r4,r8			@ d+=h
-	eor	r12,r12,r10			@ Maj(a,b,c)
-	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 4
-# if 4==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r4,r4,ror#5
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 4
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 4==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r4,r4,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r7,r7,r2			@ h+=X[i]
-	str	r2,[sp,#4*4]
-	eor	r2,r5,r6
-	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r4
-	add	r7,r7,r12			@ h+=K256[i]
-	eor	r2,r2,r6			@ Ch(e,f,g)
-	eor	r0,r8,r8,ror#11
-	add	r7,r7,r2			@ h+=Ch(e,f,g)
-#if 4==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 4<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r8,r9			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
-	eor	r12,r8,r9			@ a^b, b^c in next round
-	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r8,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r11,r11,r7			@ d+=h
-	eor	r3,r3,r9			@ Maj(a,b,c)
-	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 5
-# if 5==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r11,r11,ror#5
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 5
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 5==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r11,r11,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r6,r6,r2			@ h+=X[i]
-	str	r2,[sp,#5*4]
-	eor	r2,r4,r5
-	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r11
-	add	r6,r6,r3			@ h+=K256[i]
-	eor	r2,r2,r5			@ Ch(e,f,g)
-	eor	r0,r7,r7,ror#11
-	add	r6,r6,r2			@ h+=Ch(e,f,g)
-#if 5==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 5<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r7,r8			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
-	eor	r3,r7,r8			@ a^b, b^c in next round
-	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r7,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r10,r10,r6			@ d+=h
-	eor	r12,r12,r8			@ Maj(a,b,c)
-	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 6
-# if 6==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 6
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 6==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r10,r10,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r5,r5,r2			@ h+=X[i]
-	str	r2,[sp,#6*4]
-	eor	r2,r11,r4
-	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r10
-	add	r5,r5,r12			@ h+=K256[i]
-	eor	r2,r2,r4			@ Ch(e,f,g)
-	eor	r0,r6,r6,ror#11
-	add	r5,r5,r2			@ h+=Ch(e,f,g)
-#if 6==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 6<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r6,r7			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
-	eor	r12,r6,r7			@ a^b, b^c in next round
-	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r6,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r9,r9,r5			@ d+=h
-	eor	r3,r3,r7			@ Maj(a,b,c)
-	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 7
-# if 7==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r9,r9,ror#5
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 7
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 7==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r9,r9,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r4,r4,r2			@ h+=X[i]
-	str	r2,[sp,#7*4]
-	eor	r2,r10,r11
-	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r9
-	add	r4,r4,r3			@ h+=K256[i]
-	eor	r2,r2,r11			@ Ch(e,f,g)
-	eor	r0,r5,r5,ror#11
-	add	r4,r4,r2			@ h+=Ch(e,f,g)
-#if 7==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 7<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
-	eor	r3,r5,r6			@ a^b, b^c in next round
-	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r5,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r8,r8,r4			@ d+=h
-	eor	r12,r12,r6			@ Maj(a,b,c)
-	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 8
-# if 8==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r8,r8,ror#5
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 8
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 8==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r8,r8,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r11,r11,r2			@ h+=X[i]
-	str	r2,[sp,#8*4]
-	eor	r2,r9,r10
-	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r8
-	add	r11,r11,r12			@ h+=K256[i]
-	eor	r2,r2,r10			@ Ch(e,f,g)
-	eor	r0,r4,r4,ror#11
-	add	r11,r11,r2			@ h+=Ch(e,f,g)
-#if 8==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 8<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r4,r5			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
-	eor	r12,r4,r5			@ a^b, b^c in next round
-	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r4,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r7,r7,r11			@ d+=h
-	eor	r3,r3,r5			@ Maj(a,b,c)
-	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 9
-# if 9==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r7,r7,ror#5
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 9
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 9==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r7,r7,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r10,r10,r2			@ h+=X[i]
-	str	r2,[sp,#9*4]
-	eor	r2,r8,r9
-	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r7
-	add	r10,r10,r3			@ h+=K256[i]
-	eor	r2,r2,r9			@ Ch(e,f,g)
-	eor	r0,r11,r11,ror#11
-	add	r10,r10,r2			@ h+=Ch(e,f,g)
-#if 9==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 9<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r11,r4			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
-	eor	r3,r11,r4			@ a^b, b^c in next round
-	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r11,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r6,r6,r10			@ d+=h
-	eor	r12,r12,r4			@ Maj(a,b,c)
-	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 10
-# if 10==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 10
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 10==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r6,r6,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r9,r9,r2			@ h+=X[i]
-	str	r2,[sp,#10*4]
-	eor	r2,r7,r8
-	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r6
-	add	r9,r9,r12			@ h+=K256[i]
-	eor	r2,r2,r8			@ Ch(e,f,g)
-	eor	r0,r10,r10,ror#11
-	add	r9,r9,r2			@ h+=Ch(e,f,g)
-#if 10==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 10<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r10,r11			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
-	eor	r12,r10,r11			@ a^b, b^c in next round
-	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r10,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r5,r5,r9			@ d+=h
-	eor	r3,r3,r11			@ Maj(a,b,c)
-	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 11
-# if 11==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r5,r5,ror#5
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 11
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 11==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r5,r5,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r8,r8,r2			@ h+=X[i]
-	str	r2,[sp,#11*4]
-	eor	r2,r6,r7
-	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r5
-	add	r8,r8,r3			@ h+=K256[i]
-	eor	r2,r2,r7			@ Ch(e,f,g)
-	eor	r0,r9,r9,ror#11
-	add	r8,r8,r2			@ h+=Ch(e,f,g)
-#if 11==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 11<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r9,r10			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
-	eor	r3,r9,r10			@ a^b, b^c in next round
-	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r9,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r4,r4,r8			@ d+=h
-	eor	r12,r12,r10			@ Maj(a,b,c)
-	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 12
-# if 12==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r4,r4,ror#5
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 12
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 12==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r4,r4,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r7,r7,r2			@ h+=X[i]
-	str	r2,[sp,#12*4]
-	eor	r2,r5,r6
-	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r4
-	add	r7,r7,r12			@ h+=K256[i]
-	eor	r2,r2,r6			@ Ch(e,f,g)
-	eor	r0,r8,r8,ror#11
-	add	r7,r7,r2			@ h+=Ch(e,f,g)
-#if 12==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 12<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r8,r9			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
-	eor	r12,r8,r9			@ a^b, b^c in next round
-	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r8,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r11,r11,r7			@ d+=h
-	eor	r3,r3,r9			@ Maj(a,b,c)
-	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 13
-# if 13==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r11,r11,ror#5
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 13
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 13==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r11,r11,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r6,r6,r2			@ h+=X[i]
-	str	r2,[sp,#13*4]
-	eor	r2,r4,r5
-	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r11
-	add	r6,r6,r3			@ h+=K256[i]
-	eor	r2,r2,r5			@ Ch(e,f,g)
-	eor	r0,r7,r7,ror#11
-	add	r6,r6,r2			@ h+=Ch(e,f,g)
-#if 13==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 13<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r7,r8			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
-	eor	r3,r7,r8			@ a^b, b^c in next round
-	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r7,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r10,r10,r6			@ d+=h
-	eor	r12,r12,r8			@ Maj(a,b,c)
-	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 14
-# if 14==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 14
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	ldrb	r12,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r12,lsl#8
-	ldrb	r12,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 14==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r10,r10,ror#5
-	orr	r2,r2,r12,lsl#24
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-#endif
-	ldr	r12,[r14],#4			@ *K256++
-	add	r5,r5,r2			@ h+=X[i]
-	str	r2,[sp,#14*4]
-	eor	r2,r11,r4
-	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r10
-	add	r5,r5,r12			@ h+=K256[i]
-	eor	r2,r2,r4			@ Ch(e,f,g)
-	eor	r0,r6,r6,ror#11
-	add	r5,r5,r2			@ h+=Ch(e,f,g)
-#if 14==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 14<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r6,r7			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
-	eor	r12,r6,r7			@ a^b, b^c in next round
-	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r6,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r9,r9,r5			@ d+=h
-	eor	r3,r3,r7			@ Maj(a,b,c)
-	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	@ ldr	r2,[r1],#4			@ 15
-# if 15==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r9,r9,ror#5
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-# ifndef __ARMEB__
-	rev	r2,r2
-# endif
-#else
-	@ ldrb	r2,[r1,#3]			@ 15
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	ldrb	r3,[r1,#2]
-	ldrb	r0,[r1,#1]
-	orr	r2,r2,r3,lsl#8
-	ldrb	r3,[r1],#4
-	orr	r2,r2,r0,lsl#16
-# if 15==15
-	str	r1,[sp,#17*4]			@ make room for r1
-# endif
-	eor	r0,r9,r9,ror#5
-	orr	r2,r2,r3,lsl#24
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-#endif
-	ldr	r3,[r14],#4			@ *K256++
-	add	r4,r4,r2			@ h+=X[i]
-	str	r2,[sp,#15*4]
-	eor	r2,r10,r11
-	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r9
-	add	r4,r4,r3			@ h+=K256[i]
-	eor	r2,r2,r11			@ Ch(e,f,g)
-	eor	r0,r5,r5,ror#11
-	add	r4,r4,r2			@ h+=Ch(e,f,g)
-#if 15==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 15<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
-	eor	r3,r5,r6			@ a^b, b^c in next round
-	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r5,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r8,r8,r4			@ d+=h
-	eor	r12,r12,r6			@ Maj(a,b,c)
-	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
-Lrounds_16_xx:
-	@ ldr	r2,[sp,#1*4]		@ 16
-	@ ldr	r1,[sp,#14*4]
-	mov	r0,r2,ror#7
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#0*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#9*4]
-
-	add	r12,r12,r0
-	eor	r0,r8,r8,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r11,r11,r2			@ h+=X[i]
-	str	r2,[sp,#0*4]
-	eor	r2,r9,r10
-	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r8
-	add	r11,r11,r12			@ h+=K256[i]
-	eor	r2,r2,r10			@ Ch(e,f,g)
-	eor	r0,r4,r4,ror#11
-	add	r11,r11,r2			@ h+=Ch(e,f,g)
-#if 16==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 16<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r4,r5			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
-	eor	r12,r4,r5			@ a^b, b^c in next round
-	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r4,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r7,r7,r11			@ d+=h
-	eor	r3,r3,r5			@ Maj(a,b,c)
-	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#2*4]		@ 17
-	@ ldr	r1,[sp,#15*4]
-	mov	r0,r2,ror#7
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#1*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#10*4]
-
-	add	r3,r3,r0
-	eor	r0,r7,r7,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r10,r10,r2			@ h+=X[i]
-	str	r2,[sp,#1*4]
-	eor	r2,r8,r9
-	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r7
-	add	r10,r10,r3			@ h+=K256[i]
-	eor	r2,r2,r9			@ Ch(e,f,g)
-	eor	r0,r11,r11,ror#11
-	add	r10,r10,r2			@ h+=Ch(e,f,g)
-#if 17==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 17<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r11,r4			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
-	eor	r3,r11,r4			@ a^b, b^c in next round
-	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r11,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r6,r6,r10			@ d+=h
-	eor	r12,r12,r4			@ Maj(a,b,c)
-	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#3*4]		@ 18
-	@ ldr	r1,[sp,#0*4]
-	mov	r0,r2,ror#7
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#2*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#11*4]
-
-	add	r12,r12,r0
-	eor	r0,r6,r6,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r9,r9,r2			@ h+=X[i]
-	str	r2,[sp,#2*4]
-	eor	r2,r7,r8
-	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r6
-	add	r9,r9,r12			@ h+=K256[i]
-	eor	r2,r2,r8			@ Ch(e,f,g)
-	eor	r0,r10,r10,ror#11
-	add	r9,r9,r2			@ h+=Ch(e,f,g)
-#if 18==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 18<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r10,r11			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
-	eor	r12,r10,r11			@ a^b, b^c in next round
-	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r10,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r5,r5,r9			@ d+=h
-	eor	r3,r3,r11			@ Maj(a,b,c)
-	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#4*4]		@ 19
-	@ ldr	r1,[sp,#1*4]
-	mov	r0,r2,ror#7
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#3*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#12*4]
-
-	add	r3,r3,r0
-	eor	r0,r5,r5,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r8,r8,r2			@ h+=X[i]
-	str	r2,[sp,#3*4]
-	eor	r2,r6,r7
-	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r5
-	add	r8,r8,r3			@ h+=K256[i]
-	eor	r2,r2,r7			@ Ch(e,f,g)
-	eor	r0,r9,r9,ror#11
-	add	r8,r8,r2			@ h+=Ch(e,f,g)
-#if 19==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 19<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r9,r10			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
-	eor	r3,r9,r10			@ a^b, b^c in next round
-	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r9,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r4,r4,r8			@ d+=h
-	eor	r12,r12,r10			@ Maj(a,b,c)
-	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#5*4]		@ 20
-	@ ldr	r1,[sp,#2*4]
-	mov	r0,r2,ror#7
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#4*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#13*4]
-
-	add	r12,r12,r0
-	eor	r0,r4,r4,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r7,r7,r2			@ h+=X[i]
-	str	r2,[sp,#4*4]
-	eor	r2,r5,r6
-	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r4
-	add	r7,r7,r12			@ h+=K256[i]
-	eor	r2,r2,r6			@ Ch(e,f,g)
-	eor	r0,r8,r8,ror#11
-	add	r7,r7,r2			@ h+=Ch(e,f,g)
-#if 20==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 20<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r8,r9			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
-	eor	r12,r8,r9			@ a^b, b^c in next round
-	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r8,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r11,r11,r7			@ d+=h
-	eor	r3,r3,r9			@ Maj(a,b,c)
-	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#6*4]		@ 21
-	@ ldr	r1,[sp,#3*4]
-	mov	r0,r2,ror#7
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#5*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#14*4]
-
-	add	r3,r3,r0
-	eor	r0,r11,r11,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r6,r6,r2			@ h+=X[i]
-	str	r2,[sp,#5*4]
-	eor	r2,r4,r5
-	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r11
-	add	r6,r6,r3			@ h+=K256[i]
-	eor	r2,r2,r5			@ Ch(e,f,g)
-	eor	r0,r7,r7,ror#11
-	add	r6,r6,r2			@ h+=Ch(e,f,g)
-#if 21==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 21<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r7,r8			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
-	eor	r3,r7,r8			@ a^b, b^c in next round
-	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r7,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r10,r10,r6			@ d+=h
-	eor	r12,r12,r8			@ Maj(a,b,c)
-	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#7*4]		@ 22
-	@ ldr	r1,[sp,#4*4]
-	mov	r0,r2,ror#7
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#6*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#15*4]
-
-	add	r12,r12,r0
-	eor	r0,r10,r10,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r5,r5,r2			@ h+=X[i]
-	str	r2,[sp,#6*4]
-	eor	r2,r11,r4
-	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r10
-	add	r5,r5,r12			@ h+=K256[i]
-	eor	r2,r2,r4			@ Ch(e,f,g)
-	eor	r0,r6,r6,ror#11
-	add	r5,r5,r2			@ h+=Ch(e,f,g)
-#if 22==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 22<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r6,r7			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
-	eor	r12,r6,r7			@ a^b, b^c in next round
-	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r6,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r9,r9,r5			@ d+=h
-	eor	r3,r3,r7			@ Maj(a,b,c)
-	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#8*4]		@ 23
-	@ ldr	r1,[sp,#5*4]
-	mov	r0,r2,ror#7
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#7*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#0*4]
-
-	add	r3,r3,r0
-	eor	r0,r9,r9,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r4,r4,r2			@ h+=X[i]
-	str	r2,[sp,#7*4]
-	eor	r2,r10,r11
-	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r9
-	add	r4,r4,r3			@ h+=K256[i]
-	eor	r2,r2,r11			@ Ch(e,f,g)
-	eor	r0,r5,r5,ror#11
-	add	r4,r4,r2			@ h+=Ch(e,f,g)
-#if 23==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 23<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
-	eor	r3,r5,r6			@ a^b, b^c in next round
-	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r5,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r8,r8,r4			@ d+=h
-	eor	r12,r12,r6			@ Maj(a,b,c)
-	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#9*4]		@ 24
-	@ ldr	r1,[sp,#6*4]
-	mov	r0,r2,ror#7
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#8*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#1*4]
-
-	add	r12,r12,r0
-	eor	r0,r8,r8,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r8,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r11,r11,r2			@ h+=X[i]
-	str	r2,[sp,#8*4]
-	eor	r2,r9,r10
-	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r8
-	add	r11,r11,r12			@ h+=K256[i]
-	eor	r2,r2,r10			@ Ch(e,f,g)
-	eor	r0,r4,r4,ror#11
-	add	r11,r11,r2			@ h+=Ch(e,f,g)
-#if 24==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 24<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r4,r5			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
-	eor	r12,r4,r5			@ a^b, b^c in next round
-	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r4,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r7,r7,r11			@ d+=h
-	eor	r3,r3,r5			@ Maj(a,b,c)
-	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#10*4]		@ 25
-	@ ldr	r1,[sp,#7*4]
-	mov	r0,r2,ror#7
-	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#9*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#2*4]
-
-	add	r3,r3,r0
-	eor	r0,r7,r7,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r7,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r10,r10,r2			@ h+=X[i]
-	str	r2,[sp,#9*4]
-	eor	r2,r8,r9
-	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r7
-	add	r10,r10,r3			@ h+=K256[i]
-	eor	r2,r2,r9			@ Ch(e,f,g)
-	eor	r0,r11,r11,ror#11
-	add	r10,r10,r2			@ h+=Ch(e,f,g)
-#if 25==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 25<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r11,r4			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
-	eor	r3,r11,r4			@ a^b, b^c in next round
-	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r11,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r6,r6,r10			@ d+=h
-	eor	r12,r12,r4			@ Maj(a,b,c)
-	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#11*4]		@ 26
-	@ ldr	r1,[sp,#8*4]
-	mov	r0,r2,ror#7
-	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#10*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#3*4]
-
-	add	r12,r12,r0
-	eor	r0,r6,r6,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r6,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r9,r9,r2			@ h+=X[i]
-	str	r2,[sp,#10*4]
-	eor	r2,r7,r8
-	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r6
-	add	r9,r9,r12			@ h+=K256[i]
-	eor	r2,r2,r8			@ Ch(e,f,g)
-	eor	r0,r10,r10,ror#11
-	add	r9,r9,r2			@ h+=Ch(e,f,g)
-#if 26==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 26<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r10,r11			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
-	eor	r12,r10,r11			@ a^b, b^c in next round
-	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r10,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r5,r5,r9			@ d+=h
-	eor	r3,r3,r11			@ Maj(a,b,c)
-	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#12*4]		@ 27
-	@ ldr	r1,[sp,#9*4]
-	mov	r0,r2,ror#7
-	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#11*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#4*4]
-
-	add	r3,r3,r0
-	eor	r0,r5,r5,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r5,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r8,r8,r2			@ h+=X[i]
-	str	r2,[sp,#11*4]
-	eor	r2,r6,r7
-	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r5
-	add	r8,r8,r3			@ h+=K256[i]
-	eor	r2,r2,r7			@ Ch(e,f,g)
-	eor	r0,r9,r9,ror#11
-	add	r8,r8,r2			@ h+=Ch(e,f,g)
-#if 27==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 27<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r9,r10			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
-	eor	r3,r9,r10			@ a^b, b^c in next round
-	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r9,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r4,r4,r8			@ d+=h
-	eor	r12,r12,r10			@ Maj(a,b,c)
-	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#13*4]		@ 28
-	@ ldr	r1,[sp,#10*4]
-	mov	r0,r2,ror#7
-	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#12*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#5*4]
-
-	add	r12,r12,r0
-	eor	r0,r4,r4,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r4,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r7,r7,r2			@ h+=X[i]
-	str	r2,[sp,#12*4]
-	eor	r2,r5,r6
-	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r4
-	add	r7,r7,r12			@ h+=K256[i]
-	eor	r2,r2,r6			@ Ch(e,f,g)
-	eor	r0,r8,r8,ror#11
-	add	r7,r7,r2			@ h+=Ch(e,f,g)
-#if 28==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 28<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r8,r9			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
-	eor	r12,r8,r9			@ a^b, b^c in next round
-	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r8,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r11,r11,r7			@ d+=h
-	eor	r3,r3,r9			@ Maj(a,b,c)
-	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#14*4]		@ 29
-	@ ldr	r1,[sp,#11*4]
-	mov	r0,r2,ror#7
-	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#13*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#6*4]
-
-	add	r3,r3,r0
-	eor	r0,r11,r11,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r11,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r6,r6,r2			@ h+=X[i]
-	str	r2,[sp,#13*4]
-	eor	r2,r4,r5
-	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r11
-	add	r6,r6,r3			@ h+=K256[i]
-	eor	r2,r2,r5			@ Ch(e,f,g)
-	eor	r0,r7,r7,ror#11
-	add	r6,r6,r2			@ h+=Ch(e,f,g)
-#if 29==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 29<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r7,r8			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
-	eor	r3,r7,r8			@ a^b, b^c in next round
-	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r7,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r10,r10,r6			@ d+=h
-	eor	r12,r12,r8			@ Maj(a,b,c)
-	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#15*4]		@ 30
-	@ ldr	r1,[sp,#12*4]
-	mov	r0,r2,ror#7
-	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
-	mov	r12,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r12,r12,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#14*4]
-	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#7*4]
-
-	add	r12,r12,r0
-	eor	r0,r10,r10,ror#5	@ from BODY_00_15
-	add	r2,r2,r12
-	eor	r0,r0,r10,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r12,[r14],#4			@ *K256++
-	add	r5,r5,r2			@ h+=X[i]
-	str	r2,[sp,#14*4]
-	eor	r2,r11,r4
-	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r10
-	add	r5,r5,r12			@ h+=K256[i]
-	eor	r2,r2,r4			@ Ch(e,f,g)
-	eor	r0,r6,r6,ror#11
-	add	r5,r5,r2			@ h+=Ch(e,f,g)
-#if 30==31
-	and	r12,r12,#0xff
-	cmp	r12,#0xf2			@ done?
-#endif
-#if 30<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r12,r6,r7			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
-	eor	r12,r6,r7			@ a^b, b^c in next round
-	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r6,ror#20	@ Sigma0(a)
-	and	r3,r3,r12			@ (b^c)&=(a^b)
-	add	r9,r9,r5			@ d+=h
-	eor	r3,r3,r7			@ Maj(a,b,c)
-	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
-	@ ldr	r2,[sp,#0*4]		@ 31
-	@ ldr	r1,[sp,#13*4]
-	mov	r0,r2,ror#7
-	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
-	mov	r3,r1,ror#17
-	eor	r0,r0,r2,ror#18
-	eor	r3,r3,r1,ror#19
-	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
-	ldr	r2,[sp,#15*4]
-	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
-	ldr	r1,[sp,#8*4]
-
-	add	r3,r3,r0
-	eor	r0,r9,r9,ror#5	@ from BODY_00_15
-	add	r2,r2,r3
-	eor	r0,r0,r9,ror#19	@ Sigma1(e)
-	add	r2,r2,r1			@ X[i]
-	ldr	r3,[r14],#4			@ *K256++
-	add	r4,r4,r2			@ h+=X[i]
-	str	r2,[sp,#15*4]
-	eor	r2,r10,r11
-	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
-	and	r2,r2,r9
-	add	r4,r4,r3			@ h+=K256[i]
-	eor	r2,r2,r11			@ Ch(e,f,g)
-	eor	r0,r5,r5,ror#11
-	add	r4,r4,r2			@ h+=Ch(e,f,g)
-#if 31==31
-	and	r3,r3,#0xff
-	cmp	r3,#0xf2			@ done?
-#endif
-#if 31<15
-# if __ARM_ARCH__>=7
-	ldr	r2,[r1],#4			@ prefetch
-# else
-	ldrb	r2,[r1,#3]
-# endif
-	eor	r3,r5,r6			@ a^b, b^c in next round
-#else
-	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
-	eor	r3,r5,r6			@ a^b, b^c in next round
-	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
-#endif
-	eor	r0,r0,r5,ror#20	@ Sigma0(a)
-	and	r12,r12,r3			@ (b^c)&=(a^b)
-	add	r8,r8,r4			@ d+=h
-	eor	r12,r12,r6			@ Maj(a,b,c)
-	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
-	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
-#if __ARM_ARCH__>=7
-	ite	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	ldreq	r3,[sp,#16*4]		@ pull ctx
-	bne	Lrounds_16_xx
-
-	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
-	ldr	r0,[r3,#0]
-	ldr	r2,[r3,#4]
-	ldr	r12,[r3,#8]
-	add	r4,r4,r0
-	ldr	r0,[r3,#12]
-	add	r5,r5,r2
-	ldr	r2,[r3,#16]
-	add	r6,r6,r12
-	ldr	r12,[r3,#20]
-	add	r7,r7,r0
-	ldr	r0,[r3,#24]
-	add	r8,r8,r2
-	ldr	r2,[r3,#28]
-	add	r9,r9,r12
-	ldr	r1,[sp,#17*4]		@ pull inp
-	ldr	r12,[sp,#18*4]		@ pull inp+len
-	add	r10,r10,r0
-	add	r11,r11,r2
-	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
-	cmp	r1,r12
-	sub	r14,r14,#256	@ rewind Ktbl
-	bne	Loop
-
-	add	sp,sp,#19*4	@ destroy frame
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
-#else
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl	_sha256_block_data_order_neon
-.private_extern	_sha256_block_data_order_neon
-#ifdef __thumb2__
-.thumb_func	_sha256_block_data_order_neon
-#endif
-.align	5
-.skip	16
-_sha256_block_data_order_neon:
-LNEON:
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-
-	sub	r11,sp,#16*4+16
-	adr	r14,K256
-	bic	r11,r11,#15		@ align for 128-bit stores
-	mov	r12,sp
-	mov	sp,r11			@ alloca
-	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-
-	vld1.8	{q0},[r1]!
-	vld1.8	{q1},[r1]!
-	vld1.8	{q2},[r1]!
-	vld1.8	{q3},[r1]!
-	vld1.32	{q8},[r14,:128]!
-	vld1.32	{q9},[r14,:128]!
-	vld1.32	{q10},[r14,:128]!
-	vld1.32	{q11},[r14,:128]!
-	vrev32.8	q0,q0		@ yes, even on
-	str	r0,[sp,#64]
-	vrev32.8	q1,q1		@ big-endian
-	str	r1,[sp,#68]
-	mov	r1,sp
-	vrev32.8	q2,q2
-	str	r2,[sp,#72]
-	vrev32.8	q3,q3
-	str	r12,[sp,#76]		@ save original sp
-	vadd.i32	q8,q8,q0
-	vadd.i32	q9,q9,q1
-	vst1.32	{q8},[r1,:128]!
-	vadd.i32	q10,q10,q2
-	vst1.32	{q9},[r1,:128]!
-	vadd.i32	q11,q11,q3
-	vst1.32	{q10},[r1,:128]!
-	vst1.32	{q11},[r1,:128]!
-
-	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
-	sub	r1,r1,#64
-	ldr	r2,[sp,#0]
-	eor	r12,r12,r12
-	eor	r3,r5,r6
-	b	L_00_48
-
-.align	4
-L_00_48:
-	vext.8	q8,q0,q1,#4
-	add	r11,r11,r2
-	eor	r2,r9,r10
-	eor	r0,r8,r8,ror#5
-	vext.8	q9,q2,q3,#4
-	add	r4,r4,r12
-	and	r2,r2,r8
-	eor	r12,r0,r8,ror#19
-	vshr.u32	q10,q8,#7
-	eor	r0,r4,r4,ror#11
-	eor	r2,r2,r10
-	vadd.i32	q0,q0,q9
-	add	r11,r11,r12,ror#6
-	eor	r12,r4,r5
-	vshr.u32	q9,q8,#3
-	eor	r0,r0,r4,ror#20
-	add	r11,r11,r2
-	vsli.32	q10,q8,#25
-	ldr	r2,[sp,#4]
-	and	r3,r3,r12
-	vshr.u32	q11,q8,#18
-	add	r7,r7,r11
-	add	r11,r11,r0,ror#2
-	eor	r3,r3,r5
-	veor	q9,q9,q10
-	add	r10,r10,r2
-	vsli.32	q11,q8,#14
-	eor	r2,r8,r9
-	eor	r0,r7,r7,ror#5
-	vshr.u32	d24,d7,#17
-	add	r11,r11,r3
-	and	r2,r2,r7
-	veor	q9,q9,q11
-	eor	r3,r0,r7,ror#19
-	eor	r0,r11,r11,ror#11
-	vsli.32	d24,d7,#15
-	eor	r2,r2,r9
-	add	r10,r10,r3,ror#6
-	vshr.u32	d25,d7,#10
-	eor	r3,r11,r4
-	eor	r0,r0,r11,ror#20
-	vadd.i32	q0,q0,q9
-	add	r10,r10,r2
-	ldr	r2,[sp,#8]
-	veor	d25,d25,d24
-	and	r12,r12,r3
-	add	r6,r6,r10
-	vshr.u32	d24,d7,#19
-	add	r10,r10,r0,ror#2
-	eor	r12,r12,r4
-	vsli.32	d24,d7,#13
-	add	r9,r9,r2
-	eor	r2,r7,r8
-	veor	d25,d25,d24
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12
-	vadd.i32	d0,d0,d25
-	and	r2,r2,r6
-	eor	r12,r0,r6,ror#19
-	vshr.u32	d24,d0,#17
-	eor	r0,r10,r10,ror#11
-	eor	r2,r2,r8
-	vsli.32	d24,d0,#15
-	add	r9,r9,r12,ror#6
-	eor	r12,r10,r11
-	vshr.u32	d25,d0,#10
-	eor	r0,r0,r10,ror#20
-	add	r9,r9,r2
-	veor	d25,d25,d24
-	ldr	r2,[sp,#12]
-	and	r3,r3,r12
-	vshr.u32	d24,d0,#19
-	add	r5,r5,r9
-	add	r9,r9,r0,ror#2
-	eor	r3,r3,r11
-	vld1.32	{q8},[r14,:128]!
-	add	r8,r8,r2
-	vsli.32	d24,d0,#13
-	eor	r2,r6,r7
-	eor	r0,r5,r5,ror#5
-	veor	d25,d25,d24
-	add	r9,r9,r3
-	and	r2,r2,r5
-	vadd.i32	d1,d1,d25
-	eor	r3,r0,r5,ror#19
-	eor	r0,r9,r9,ror#11
-	vadd.i32	q8,q8,q0
-	eor	r2,r2,r7
-	add	r8,r8,r3,ror#6
-	eor	r3,r9,r10
-	eor	r0,r0,r9,ror#20
-	add	r8,r8,r2
-	ldr	r2,[sp,#16]
-	and	r12,r12,r3
-	add	r4,r4,r8
-	vst1.32	{q8},[r1,:128]!
-	add	r8,r8,r0,ror#2
-	eor	r12,r12,r10
-	vext.8	q8,q1,q2,#4
-	add	r7,r7,r2
-	eor	r2,r5,r6
-	eor	r0,r4,r4,ror#5
-	vext.8	q9,q3,q0,#4
-	add	r8,r8,r12
-	and	r2,r2,r4
-	eor	r12,r0,r4,ror#19
-	vshr.u32	q10,q8,#7
-	eor	r0,r8,r8,ror#11
-	eor	r2,r2,r6
-	vadd.i32	q1,q1,q9
-	add	r7,r7,r12,ror#6
-	eor	r12,r8,r9
-	vshr.u32	q9,q8,#3
-	eor	r0,r0,r8,ror#20
-	add	r7,r7,r2
-	vsli.32	q10,q8,#25
-	ldr	r2,[sp,#20]
-	and	r3,r3,r12
-	vshr.u32	q11,q8,#18
-	add	r11,r11,r7
-	add	r7,r7,r0,ror#2
-	eor	r3,r3,r9
-	veor	q9,q9,q10
-	add	r6,r6,r2
-	vsli.32	q11,q8,#14
-	eor	r2,r4,r5
-	eor	r0,r11,r11,ror#5
-	vshr.u32	d24,d1,#17
-	add	r7,r7,r3
-	and	r2,r2,r11
-	veor	q9,q9,q11
-	eor	r3,r0,r11,ror#19
-	eor	r0,r7,r7,ror#11
-	vsli.32	d24,d1,#15
-	eor	r2,r2,r5
-	add	r6,r6,r3,ror#6
-	vshr.u32	d25,d1,#10
-	eor	r3,r7,r8
-	eor	r0,r0,r7,ror#20
-	vadd.i32	q1,q1,q9
-	add	r6,r6,r2
-	ldr	r2,[sp,#24]
-	veor	d25,d25,d24
-	and	r12,r12,r3
-	add	r10,r10,r6
-	vshr.u32	d24,d1,#19
-	add	r6,r6,r0,ror#2
-	eor	r12,r12,r8
-	vsli.32	d24,d1,#13
-	add	r5,r5,r2
-	eor	r2,r11,r4
-	veor	d25,d25,d24
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12
-	vadd.i32	d2,d2,d25
-	and	r2,r2,r10
-	eor	r12,r0,r10,ror#19
-	vshr.u32	d24,d2,#17
-	eor	r0,r6,r6,ror#11
-	eor	r2,r2,r4
-	vsli.32	d24,d2,#15
-	add	r5,r5,r12,ror#6
-	eor	r12,r6,r7
-	vshr.u32	d25,d2,#10
-	eor	r0,r0,r6,ror#20
-	add	r5,r5,r2
-	veor	d25,d25,d24
-	ldr	r2,[sp,#28]
-	and	r3,r3,r12
-	vshr.u32	d24,d2,#19
-	add	r9,r9,r5
-	add	r5,r5,r0,ror#2
-	eor	r3,r3,r7
-	vld1.32	{q8},[r14,:128]!
-	add	r4,r4,r2
-	vsli.32	d24,d2,#13
-	eor	r2,r10,r11
-	eor	r0,r9,r9,ror#5
-	veor	d25,d25,d24
-	add	r5,r5,r3
-	and	r2,r2,r9
-	vadd.i32	d3,d3,d25
-	eor	r3,r0,r9,ror#19
-	eor	r0,r5,r5,ror#11
-	vadd.i32	q8,q8,q1
-	eor	r2,r2,r11
-	add	r4,r4,r3,ror#6
-	eor	r3,r5,r6
-	eor	r0,r0,r5,ror#20
-	add	r4,r4,r2
-	ldr	r2,[sp,#32]
-	and	r12,r12,r3
-	add	r8,r8,r4
-	vst1.32	{q8},[r1,:128]!
-	add	r4,r4,r0,ror#2
-	eor	r12,r12,r6
-	vext.8	q8,q2,q3,#4
-	add	r11,r11,r2
-	eor	r2,r9,r10
-	eor	r0,r8,r8,ror#5
-	vext.8	q9,q0,q1,#4
-	add	r4,r4,r12
-	and	r2,r2,r8
-	eor	r12,r0,r8,ror#19
-	vshr.u32	q10,q8,#7
-	eor	r0,r4,r4,ror#11
-	eor	r2,r2,r10
-	vadd.i32	q2,q2,q9
-	add	r11,r11,r12,ror#6
-	eor	r12,r4,r5
-	vshr.u32	q9,q8,#3
-	eor	r0,r0,r4,ror#20
-	add	r11,r11,r2
-	vsli.32	q10,q8,#25
-	ldr	r2,[sp,#36]
-	and	r3,r3,r12
-	vshr.u32	q11,q8,#18
-	add	r7,r7,r11
-	add	r11,r11,r0,ror#2
-	eor	r3,r3,r5
-	veor	q9,q9,q10
-	add	r10,r10,r2
-	vsli.32	q11,q8,#14
-	eor	r2,r8,r9
-	eor	r0,r7,r7,ror#5
-	vshr.u32	d24,d3,#17
-	add	r11,r11,r3
-	and	r2,r2,r7
-	veor	q9,q9,q11
-	eor	r3,r0,r7,ror#19
-	eor	r0,r11,r11,ror#11
-	vsli.32	d24,d3,#15
-	eor	r2,r2,r9
-	add	r10,r10,r3,ror#6
-	vshr.u32	d25,d3,#10
-	eor	r3,r11,r4
-	eor	r0,r0,r11,ror#20
-	vadd.i32	q2,q2,q9
-	add	r10,r10,r2
-	ldr	r2,[sp,#40]
-	veor	d25,d25,d24
-	and	r12,r12,r3
-	add	r6,r6,r10
-	vshr.u32	d24,d3,#19
-	add	r10,r10,r0,ror#2
-	eor	r12,r12,r4
-	vsli.32	d24,d3,#13
-	add	r9,r9,r2
-	eor	r2,r7,r8
-	veor	d25,d25,d24
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12
-	vadd.i32	d4,d4,d25
-	and	r2,r2,r6
-	eor	r12,r0,r6,ror#19
-	vshr.u32	d24,d4,#17
-	eor	r0,r10,r10,ror#11
-	eor	r2,r2,r8
-	vsli.32	d24,d4,#15
-	add	r9,r9,r12,ror#6
-	eor	r12,r10,r11
-	vshr.u32	d25,d4,#10
-	eor	r0,r0,r10,ror#20
-	add	r9,r9,r2
-	veor	d25,d25,d24
-	ldr	r2,[sp,#44]
-	and	r3,r3,r12
-	vshr.u32	d24,d4,#19
-	add	r5,r5,r9
-	add	r9,r9,r0,ror#2
-	eor	r3,r3,r11
-	vld1.32	{q8},[r14,:128]!
-	add	r8,r8,r2
-	vsli.32	d24,d4,#13
-	eor	r2,r6,r7
-	eor	r0,r5,r5,ror#5
-	veor	d25,d25,d24
-	add	r9,r9,r3
-	and	r2,r2,r5
-	vadd.i32	d5,d5,d25
-	eor	r3,r0,r5,ror#19
-	eor	r0,r9,r9,ror#11
-	vadd.i32	q8,q8,q2
-	eor	r2,r2,r7
-	add	r8,r8,r3,ror#6
-	eor	r3,r9,r10
-	eor	r0,r0,r9,ror#20
-	add	r8,r8,r2
-	ldr	r2,[sp,#48]
-	and	r12,r12,r3
-	add	r4,r4,r8
-	vst1.32	{q8},[r1,:128]!
-	add	r8,r8,r0,ror#2
-	eor	r12,r12,r10
-	vext.8	q8,q3,q0,#4
-	add	r7,r7,r2
-	eor	r2,r5,r6
-	eor	r0,r4,r4,ror#5
-	vext.8	q9,q1,q2,#4
-	add	r8,r8,r12
-	and	r2,r2,r4
-	eor	r12,r0,r4,ror#19
-	vshr.u32	q10,q8,#7
-	eor	r0,r8,r8,ror#11
-	eor	r2,r2,r6
-	vadd.i32	q3,q3,q9
-	add	r7,r7,r12,ror#6
-	eor	r12,r8,r9
-	vshr.u32	q9,q8,#3
-	eor	r0,r0,r8,ror#20
-	add	r7,r7,r2
-	vsli.32	q10,q8,#25
-	ldr	r2,[sp,#52]
-	and	r3,r3,r12
-	vshr.u32	q11,q8,#18
-	add	r11,r11,r7
-	add	r7,r7,r0,ror#2
-	eor	r3,r3,r9
-	veor	q9,q9,q10
-	add	r6,r6,r2
-	vsli.32	q11,q8,#14
-	eor	r2,r4,r5
-	eor	r0,r11,r11,ror#5
-	vshr.u32	d24,d5,#17
-	add	r7,r7,r3
-	and	r2,r2,r11
-	veor	q9,q9,q11
-	eor	r3,r0,r11,ror#19
-	eor	r0,r7,r7,ror#11
-	vsli.32	d24,d5,#15
-	eor	r2,r2,r5
-	add	r6,r6,r3,ror#6
-	vshr.u32	d25,d5,#10
-	eor	r3,r7,r8
-	eor	r0,r0,r7,ror#20
-	vadd.i32	q3,q3,q9
-	add	r6,r6,r2
-	ldr	r2,[sp,#56]
-	veor	d25,d25,d24
-	and	r12,r12,r3
-	add	r10,r10,r6
-	vshr.u32	d24,d5,#19
-	add	r6,r6,r0,ror#2
-	eor	r12,r12,r8
-	vsli.32	d24,d5,#13
-	add	r5,r5,r2
-	eor	r2,r11,r4
-	veor	d25,d25,d24
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12
-	vadd.i32	d6,d6,d25
-	and	r2,r2,r10
-	eor	r12,r0,r10,ror#19
-	vshr.u32	d24,d6,#17
-	eor	r0,r6,r6,ror#11
-	eor	r2,r2,r4
-	vsli.32	d24,d6,#15
-	add	r5,r5,r12,ror#6
-	eor	r12,r6,r7
-	vshr.u32	d25,d6,#10
-	eor	r0,r0,r6,ror#20
-	add	r5,r5,r2
-	veor	d25,d25,d24
-	ldr	r2,[sp,#60]
-	and	r3,r3,r12
-	vshr.u32	d24,d6,#19
-	add	r9,r9,r5
-	add	r5,r5,r0,ror#2
-	eor	r3,r3,r7
-	vld1.32	{q8},[r14,:128]!
-	add	r4,r4,r2
-	vsli.32	d24,d6,#13
-	eor	r2,r10,r11
-	eor	r0,r9,r9,ror#5
-	veor	d25,d25,d24
-	add	r5,r5,r3
-	and	r2,r2,r9
-	vadd.i32	d7,d7,d25
-	eor	r3,r0,r9,ror#19
-	eor	r0,r5,r5,ror#11
-	vadd.i32	q8,q8,q3
-	eor	r2,r2,r11
-	add	r4,r4,r3,ror#6
-	eor	r3,r5,r6
-	eor	r0,r0,r5,ror#20
-	add	r4,r4,r2
-	ldr	r2,[r14]
-	and	r12,r12,r3
-	add	r8,r8,r4
-	vst1.32	{q8},[r1,:128]!
-	add	r4,r4,r0,ror#2
-	eor	r12,r12,r6
-	teq	r2,#0				@ check for K256 terminator
-	ldr	r2,[sp,#0]
-	sub	r1,r1,#64
-	bne	L_00_48
-
-	ldr	r1,[sp,#68]
-	ldr	r0,[sp,#72]
-	sub	r14,r14,#256	@ rewind r14
-	teq	r1,r0
-	it	eq
-	subeq	r1,r1,#64		@ avoid SEGV
-	vld1.8	{q0},[r1]!		@ load next input block
-	vld1.8	{q1},[r1]!
-	vld1.8	{q2},[r1]!
-	vld1.8	{q3},[r1]!
-	it	ne
-	strne	r1,[sp,#68]
-	mov	r1,sp
-	add	r11,r11,r2
-	eor	r2,r9,r10
-	eor	r0,r8,r8,ror#5
-	add	r4,r4,r12
-	vld1.32	{q8},[r14,:128]!
-	and	r2,r2,r8
-	eor	r12,r0,r8,ror#19
-	eor	r0,r4,r4,ror#11
-	eor	r2,r2,r10
-	vrev32.8	q0,q0
-	add	r11,r11,r12,ror#6
-	eor	r12,r4,r5
-	eor	r0,r0,r4,ror#20
-	add	r11,r11,r2
-	vadd.i32	q8,q8,q0
-	ldr	r2,[sp,#4]
-	and	r3,r3,r12
-	add	r7,r7,r11
-	add	r11,r11,r0,ror#2
-	eor	r3,r3,r5
-	add	r10,r10,r2
-	eor	r2,r8,r9
-	eor	r0,r7,r7,ror#5
-	add	r11,r11,r3
-	and	r2,r2,r7
-	eor	r3,r0,r7,ror#19
-	eor	r0,r11,r11,ror#11
-	eor	r2,r2,r9
-	add	r10,r10,r3,ror#6
-	eor	r3,r11,r4
-	eor	r0,r0,r11,ror#20
-	add	r10,r10,r2
-	ldr	r2,[sp,#8]
-	and	r12,r12,r3
-	add	r6,r6,r10
-	add	r10,r10,r0,ror#2
-	eor	r12,r12,r4
-	add	r9,r9,r2
-	eor	r2,r7,r8
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12
-	and	r2,r2,r6
-	eor	r12,r0,r6,ror#19
-	eor	r0,r10,r10,ror#11
-	eor	r2,r2,r8
-	add	r9,r9,r12,ror#6
-	eor	r12,r10,r11
-	eor	r0,r0,r10,ror#20
-	add	r9,r9,r2
-	ldr	r2,[sp,#12]
-	and	r3,r3,r12
-	add	r5,r5,r9
-	add	r9,r9,r0,ror#2
-	eor	r3,r3,r11
-	add	r8,r8,r2
-	eor	r2,r6,r7
-	eor	r0,r5,r5,ror#5
-	add	r9,r9,r3
-	and	r2,r2,r5
-	eor	r3,r0,r5,ror#19
-	eor	r0,r9,r9,ror#11
-	eor	r2,r2,r7
-	add	r8,r8,r3,ror#6
-	eor	r3,r9,r10
-	eor	r0,r0,r9,ror#20
-	add	r8,r8,r2
-	ldr	r2,[sp,#16]
-	and	r12,r12,r3
-	add	r4,r4,r8
-	add	r8,r8,r0,ror#2
-	eor	r12,r12,r10
-	vst1.32	{q8},[r1,:128]!
-	add	r7,r7,r2
-	eor	r2,r5,r6
-	eor	r0,r4,r4,ror#5
-	add	r8,r8,r12
-	vld1.32	{q8},[r14,:128]!
-	and	r2,r2,r4
-	eor	r12,r0,r4,ror#19
-	eor	r0,r8,r8,ror#11
-	eor	r2,r2,r6
-	vrev32.8	q1,q1
-	add	r7,r7,r12,ror#6
-	eor	r12,r8,r9
-	eor	r0,r0,r8,ror#20
-	add	r7,r7,r2
-	vadd.i32	q8,q8,q1
-	ldr	r2,[sp,#20]
-	and	r3,r3,r12
-	add	r11,r11,r7
-	add	r7,r7,r0,ror#2
-	eor	r3,r3,r9
-	add	r6,r6,r2
-	eor	r2,r4,r5
-	eor	r0,r11,r11,ror#5
-	add	r7,r7,r3
-	and	r2,r2,r11
-	eor	r3,r0,r11,ror#19
-	eor	r0,r7,r7,ror#11
-	eor	r2,r2,r5
-	add	r6,r6,r3,ror#6
-	eor	r3,r7,r8
-	eor	r0,r0,r7,ror#20
-	add	r6,r6,r2
-	ldr	r2,[sp,#24]
-	and	r12,r12,r3
-	add	r10,r10,r6
-	add	r6,r6,r0,ror#2
-	eor	r12,r12,r8
-	add	r5,r5,r2
-	eor	r2,r11,r4
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12
-	and	r2,r2,r10
-	eor	r12,r0,r10,ror#19
-	eor	r0,r6,r6,ror#11
-	eor	r2,r2,r4
-	add	r5,r5,r12,ror#6
-	eor	r12,r6,r7
-	eor	r0,r0,r6,ror#20
-	add	r5,r5,r2
-	ldr	r2,[sp,#28]
-	and	r3,r3,r12
-	add	r9,r9,r5
-	add	r5,r5,r0,ror#2
-	eor	r3,r3,r7
-	add	r4,r4,r2
-	eor	r2,r10,r11
-	eor	r0,r9,r9,ror#5
-	add	r5,r5,r3
-	and	r2,r2,r9
-	eor	r3,r0,r9,ror#19
-	eor	r0,r5,r5,ror#11
-	eor	r2,r2,r11
-	add	r4,r4,r3,ror#6
-	eor	r3,r5,r6
-	eor	r0,r0,r5,ror#20
-	add	r4,r4,r2
-	ldr	r2,[sp,#32]
-	and	r12,r12,r3
-	add	r8,r8,r4
-	add	r4,r4,r0,ror#2
-	eor	r12,r12,r6
-	vst1.32	{q8},[r1,:128]!
-	add	r11,r11,r2
-	eor	r2,r9,r10
-	eor	r0,r8,r8,ror#5
-	add	r4,r4,r12
-	vld1.32	{q8},[r14,:128]!
-	and	r2,r2,r8
-	eor	r12,r0,r8,ror#19
-	eor	r0,r4,r4,ror#11
-	eor	r2,r2,r10
-	vrev32.8	q2,q2
-	add	r11,r11,r12,ror#6
-	eor	r12,r4,r5
-	eor	r0,r0,r4,ror#20
-	add	r11,r11,r2
-	vadd.i32	q8,q8,q2
-	ldr	r2,[sp,#36]
-	and	r3,r3,r12
-	add	r7,r7,r11
-	add	r11,r11,r0,ror#2
-	eor	r3,r3,r5
-	add	r10,r10,r2
-	eor	r2,r8,r9
-	eor	r0,r7,r7,ror#5
-	add	r11,r11,r3
-	and	r2,r2,r7
-	eor	r3,r0,r7,ror#19
-	eor	r0,r11,r11,ror#11
-	eor	r2,r2,r9
-	add	r10,r10,r3,ror#6
-	eor	r3,r11,r4
-	eor	r0,r0,r11,ror#20
-	add	r10,r10,r2
-	ldr	r2,[sp,#40]
-	and	r12,r12,r3
-	add	r6,r6,r10
-	add	r10,r10,r0,ror#2
-	eor	r12,r12,r4
-	add	r9,r9,r2
-	eor	r2,r7,r8
-	eor	r0,r6,r6,ror#5
-	add	r10,r10,r12
-	and	r2,r2,r6
-	eor	r12,r0,r6,ror#19
-	eor	r0,r10,r10,ror#11
-	eor	r2,r2,r8
-	add	r9,r9,r12,ror#6
-	eor	r12,r10,r11
-	eor	r0,r0,r10,ror#20
-	add	r9,r9,r2
-	ldr	r2,[sp,#44]
-	and	r3,r3,r12
-	add	r5,r5,r9
-	add	r9,r9,r0,ror#2
-	eor	r3,r3,r11
-	add	r8,r8,r2
-	eor	r2,r6,r7
-	eor	r0,r5,r5,ror#5
-	add	r9,r9,r3
-	and	r2,r2,r5
-	eor	r3,r0,r5,ror#19
-	eor	r0,r9,r9,ror#11
-	eor	r2,r2,r7
-	add	r8,r8,r3,ror#6
-	eor	r3,r9,r10
-	eor	r0,r0,r9,ror#20
-	add	r8,r8,r2
-	ldr	r2,[sp,#48]
-	and	r12,r12,r3
-	add	r4,r4,r8
-	add	r8,r8,r0,ror#2
-	eor	r12,r12,r10
-	vst1.32	{q8},[r1,:128]!
-	add	r7,r7,r2
-	eor	r2,r5,r6
-	eor	r0,r4,r4,ror#5
-	add	r8,r8,r12
-	vld1.32	{q8},[r14,:128]!
-	and	r2,r2,r4
-	eor	r12,r0,r4,ror#19
-	eor	r0,r8,r8,ror#11
-	eor	r2,r2,r6
-	vrev32.8	q3,q3
-	add	r7,r7,r12,ror#6
-	eor	r12,r8,r9
-	eor	r0,r0,r8,ror#20
-	add	r7,r7,r2
-	vadd.i32	q8,q8,q3
-	ldr	r2,[sp,#52]
-	and	r3,r3,r12
-	add	r11,r11,r7
-	add	r7,r7,r0,ror#2
-	eor	r3,r3,r9
-	add	r6,r6,r2
-	eor	r2,r4,r5
-	eor	r0,r11,r11,ror#5
-	add	r7,r7,r3
-	and	r2,r2,r11
-	eor	r3,r0,r11,ror#19
-	eor	r0,r7,r7,ror#11
-	eor	r2,r2,r5
-	add	r6,r6,r3,ror#6
-	eor	r3,r7,r8
-	eor	r0,r0,r7,ror#20
-	add	r6,r6,r2
-	ldr	r2,[sp,#56]
-	and	r12,r12,r3
-	add	r10,r10,r6
-	add	r6,r6,r0,ror#2
-	eor	r12,r12,r8
-	add	r5,r5,r2
-	eor	r2,r11,r4
-	eor	r0,r10,r10,ror#5
-	add	r6,r6,r12
-	and	r2,r2,r10
-	eor	r12,r0,r10,ror#19
-	eor	r0,r6,r6,ror#11
-	eor	r2,r2,r4
-	add	r5,r5,r12,ror#6
-	eor	r12,r6,r7
-	eor	r0,r0,r6,ror#20
-	add	r5,r5,r2
-	ldr	r2,[sp,#60]
-	and	r3,r3,r12
-	add	r9,r9,r5
-	add	r5,r5,r0,ror#2
-	eor	r3,r3,r7
-	add	r4,r4,r2
-	eor	r2,r10,r11
-	eor	r0,r9,r9,ror#5
-	add	r5,r5,r3
-	and	r2,r2,r9
-	eor	r3,r0,r9,ror#19
-	eor	r0,r5,r5,ror#11
-	eor	r2,r2,r11
-	add	r4,r4,r3,ror#6
-	eor	r3,r5,r6
-	eor	r0,r0,r5,ror#20
-	add	r4,r4,r2
-	ldr	r2,[sp,#64]
-	and	r12,r12,r3
-	add	r8,r8,r4
-	add	r4,r4,r0,ror#2
-	eor	r12,r12,r6
-	vst1.32	{q8},[r1,:128]!
-	ldr	r0,[r2,#0]
-	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
-	ldr	r12,[r2,#4]
-	ldr	r3,[r2,#8]
-	ldr	r1,[r2,#12]
-	add	r4,r4,r0			@ accumulate
-	ldr	r0,[r2,#16]
-	add	r5,r5,r12
-	ldr	r12,[r2,#20]
-	add	r6,r6,r3
-	ldr	r3,[r2,#24]
-	add	r7,r7,r1
-	ldr	r1,[r2,#28]
-	add	r8,r8,r0
-	str	r4,[r2],#4
-	add	r9,r9,r12
-	str	r5,[r2],#4
-	add	r10,r10,r3
-	str	r6,[r2],#4
-	add	r11,r11,r1
-	str	r7,[r2],#4
-	stmia	r2,{r8,r9,r10,r11}
-
-	ittte	ne
-	movne	r1,sp
-	ldrne	r2,[sp,#0]
-	eorne	r12,r12,r12
-	ldreq	sp,[sp,#76]			@ restore original sp
-	itt	ne
-	eorne	r3,r5,r6
-	bne	L_00_48
-
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# if defined(__thumb2__)
-#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
-# else
-#  define INST(a,b,c,d)	.byte	a,b,c,d
-# endif
-
-#ifdef __thumb2__
-.thumb_func	sha256_block_data_order_armv8
-#endif
-.align	5
-sha256_block_data_order_armv8:
-LARMv8:
-	vld1.32	{q0,q1},[r0]
-	sub	r3,r3,#256+32
-	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
-	b	Loop_v8
-
-.align	4
-Loop_v8:
-	vld1.8	{q8,q9},[r1]!
-	vld1.8	{q10,q11},[r1]!
-	vld1.32	{q12},[r3]!
-	vrev32.8	q8,q8
-	vrev32.8	q9,q9
-	vrev32.8	q10,q10
-	vrev32.8	q11,q11
-	vmov	q14,q0	@ offload
-	vmov	q15,q1
-	teq	r1,r2
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q9
-	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q10
-	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q11
-	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q9
-	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q10
-	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q11
-	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q9
-	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q10
-	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q11
-	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
-	vld1.32	{q13},[r3]!
-	vadd.i32	q12,q12,q8
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-
-	vld1.32	{q12},[r3]!
-	vadd.i32	q13,q13,q9
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-
-	vld1.32	{q13},[r3]
-	vadd.i32	q12,q12,q10
-	sub	r3,r3,#256-16	@ rewind
-	vmov	q2,q0
-	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
-	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
-
-	vadd.i32	q13,q13,q11
-	vmov	q2,q0
-	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
-	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
-
-	vadd.i32	q0,q0,q14
-	vadd.i32	q1,q1,q15
-	it	ne
-	bne	Loop_v8
-
-	vst1.32	{q0,q1},[r0]
-
-	bx	lr		@ bx lr
-
-#endif
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm	_OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol	_OPENSSL_armcap_P
-.long	0
-.private_extern	_OPENSSL_armcap_P
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
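
For reference, the SHA-256 block-transform assembly deleted above implements
the standard SHA-256 compression function from FIPS 180-4; its inline
comments ("h+=Sigma1(e)", "h+=Ch(e,f,g)", "h+=K256[i]", "h+=X[i]", "d+=h",
"h+=Sigma0(a)", "h+=Maj(a,b,c)", "sigma0(X[i+1])", "sigma1(X[i+14])") track
the round and message-schedule computation sketched below in plain C. The
sketch is illustrative only: rotr32, sha256_round, and sha256_schedule are
placeholder names rather than BoringSSL APIs, and it leaves out the
assembly's register renaming and instruction scheduling as well as the
equivalent ((f^g)&e)^g expression it uses for Ch.

#include <stddef.h>
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n) {
  return (x >> n) | (x << (32 - n));
}

/* SHA-256 Sigma/sigma functions (FIPS 180-4). */
static inline uint32_t Sigma0(uint32_t x) {
  return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22);
}
static inline uint32_t Sigma1(uint32_t x) {
  return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25);
}
static inline uint32_t sigma0(uint32_t x) {
  return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}
static inline uint32_t sigma1(uint32_t x) {
  return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}

static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g) {
  return (e & f) ^ (~e & g);
}
static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) {
  return (a & b) ^ (a & c) ^ (b & c);
}

/* One compression round on the working state s = {a,b,c,d,e,f,g,h},
 * given the round constant k (K256[i]) and schedule word w (X[i]). */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w) {
  uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
  uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
  uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w; /* "h+=Sigma1(e)" etc. */
  uint32_t t2 = Sigma0(a) + Maj(a, b, c);
  s[7] = g; s[6] = f; s[5] = e;
  s[4] = d + t1;                                     /* "d+=h" */
  s[3] = c; s[2] = b; s[1] = a;
  s[0] = t1 + t2;                                    /* "h+=Sigma0(a)+Maj(a,b,c)" */
}

/* Schedule for rounds 16..63:
 * W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
 * which matches the "sigma0(X[i+1])"/"sigma1(X[i+14])" comments on the
 * 16-word buffer the assembly keeps on the stack. */
static uint32_t sha256_schedule(const uint32_t w[], size_t t) {
  return sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}
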
diff --git a/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S b/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S
deleted file mode 100644
index 12884b5..0000000
--- a/apple-arm/crypto/fipsmodule/sha512-armv4-apple.S
+++ /dev/null
@@ -1,1891 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
-@
-@ Licensed under the OpenSSL license (the "License").  You may not use
-@ this file except in compliance with the License.  You can obtain a copy
-@ in the file LICENSE in the source distribution or at
-@ https://www.openssl.org/source/license.html
-
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Permission to use under GPL terms is granted.
-@ ====================================================================
-
-@ SHA512 block procedure for ARMv4. September 2007.
-
-@ This code is ~4.5 (four and a half) times faster than code generated
-@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-@ Xscale PXA250 core].
-@
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
-@ Cortex A8 core and ~40 cycles per processed byte.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 7%
-@ improvement on Cortex A8 core and ~38 cycles per byte.
-
-@ March 2011.
-@
-@ Add NEON implementation. On Cortex A8 it was measured to process
-@ one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-@ August 2012.
-@
-@ Improve NEON performance by 12% on Snapdragon S4. In absolute
-@ terms it's 22.6 cycles per byte, which is a disappointing result.
-@ Technical writers asserted that the 3-way S4 pipeline can sustain
-@ multiple NEON instructions per cycle, but dual NEON issue could
-@ not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
-@ for further details. On a side note, Cortex-A15 processes one byte
-@ in 16 cycles.
-
-@ Byte order [in]dependence. =========================================
-@
-@ Originally the caller was expected to maintain a specific *dword* order
-@ in h[0-7], namely with the most significant dword at the *lower*
-@ address, which was reflected in the two parameters below as 0 and 4.
-@ Now the caller is expected to maintain native byte order for whole
-@ 64-bit values.
-#ifndef __KERNEL__
-# include <openssl/arm_arch.h>
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
-@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
-
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-# define adrl adr
-#else
-.code	32
-#endif
-
-
-.align	5
-K512:
-	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
-	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
-	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
-	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
-	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
-	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
-	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
-	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
-	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
-	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
-	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
-	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
-	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
-	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
-	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
-	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
-	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
-	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
-	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
-	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
-	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
-	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
-	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
-	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
-	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
-	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
-	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
-	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
-	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
-	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
-	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
-	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
-	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
-	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
-	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
-	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
-	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
-	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
-	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
-	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-Lsha512_block_data_order
-.skip	32-4
-#else
-.skip	32
-#endif
-
-.globl	_sha512_block_data_order
-.private_extern	_sha512_block_data_order
-#ifdef __thumb2__
-.thumb_func	_sha512_block_data_order
-#endif
-_sha512_block_data_order:
-Lsha512_block_data_order:
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
-	sub	r3,pc,#8		@ _sha512_block_data_order
-#else
-	adr	r3,Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	ldr	r12,LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV7_NEON
-	bne	LNEON
-#endif
-	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
-	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-	sub	r14,r3,#672		@ K512
-	sub	sp,sp,#9*8
-
-	ldr	r7,[r0,#32+LO]
-	ldr	r8,[r0,#32+HI]
-	ldr	r9, [r0,#48+LO]
-	ldr	r10, [r0,#48+HI]
-	ldr	r11, [r0,#56+LO]
-	ldr	r12, [r0,#56+HI]
-Loop:
-	str	r9, [sp,#48+0]
-	str	r10, [sp,#48+4]
-	str	r11, [sp,#56+0]
-	str	r12, [sp,#56+4]
-	ldr	r5,[r0,#0+LO]
-	ldr	r6,[r0,#0+HI]
-	ldr	r3,[r0,#8+LO]
-	ldr	r4,[r0,#8+HI]
-	ldr	r9, [r0,#16+LO]
-	ldr	r10, [r0,#16+HI]
-	ldr	r11, [r0,#24+LO]
-	ldr	r12, [r0,#24+HI]
-	str	r3,[sp,#8+0]
-	str	r4,[sp,#8+4]
-	str	r9, [sp,#16+0]
-	str	r10, [sp,#16+4]
-	str	r11, [sp,#24+0]
-	str	r12, [sp,#24+4]
-	ldr	r3,[r0,#40+LO]
-	ldr	r4,[r0,#40+HI]
-	str	r3,[sp,#40+0]
-	str	r4,[sp,#40+4]
-
-L00_15:
-#if __ARM_ARCH__<7
-	ldrb	r3,[r1,#7]
-	ldrb	r9, [r1,#6]
-	ldrb	r10, [r1,#5]
-	ldrb	r11, [r1,#4]
-	ldrb	r4,[r1,#3]
-	ldrb	r12, [r1,#2]
-	orr	r3,r3,r9,lsl#8
-	ldrb	r9, [r1,#1]
-	orr	r3,r3,r10,lsl#16
-	ldrb	r10, [r1],#8
-	orr	r3,r3,r11,lsl#24
-	orr	r4,r4,r12,lsl#8
-	orr	r4,r4,r9,lsl#16
-	orr	r4,r4,r10,lsl#24
-#else
-	ldr	r3,[r1,#4]
-	ldr	r4,[r1],#8
-#ifdef __ARMEL__
-	rev	r3,r3
-	rev	r4,r4
-#endif
-#endif
-	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
-	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
-	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
-	mov	r9,r7,lsr#14
-	str	r3,[sp,#64+0]
-	mov	r10,r8,lsr#14
-	str	r4,[sp,#64+4]
-	eor	r9,r9,r8,lsl#18
-	ldr	r11,[sp,#56+0]	@ h.lo
-	eor	r10,r10,r7,lsl#18
-	ldr	r12,[sp,#56+4]	@ h.hi
-	eor	r9,r9,r7,lsr#18
-	eor	r10,r10,r8,lsr#18
-	eor	r9,r9,r8,lsl#14
-	eor	r10,r10,r7,lsl#14
-	eor	r9,r9,r8,lsr#9
-	eor	r10,r10,r7,lsr#9
-	eor	r9,r9,r7,lsl#23
-	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
-	adds	r3,r3,r9
-	ldr	r9,[sp,#40+0]	@ f.lo
-	adc	r4,r4,r10		@ T += Sigma1(e)
-	ldr	r10,[sp,#40+4]	@ f.hi
-	adds	r3,r3,r11
-	ldr	r11,[sp,#48+0]	@ g.lo
-	adc	r4,r4,r12		@ T += h
-	ldr	r12,[sp,#48+4]	@ g.hi
-
-	eor	r9,r9,r11
-	str	r7,[sp,#32+0]
-	eor	r10,r10,r12
-	str	r8,[sp,#32+4]
-	and	r9,r9,r7
-	str	r5,[sp,#0+0]
-	and	r10,r10,r8
-	str	r6,[sp,#0+4]
-	eor	r9,r9,r11
-	ldr	r11,[r14,#LO]	@ K[i].lo
-	eor	r10,r10,r12		@ Ch(e,f,g)
-	ldr	r12,[r14,#HI]	@ K[i].hi
-
-	adds	r3,r3,r9
-	ldr	r7,[sp,#24+0]	@ d.lo
-	adc	r4,r4,r10		@ T += Ch(e,f,g)
-	ldr	r8,[sp,#24+4]	@ d.hi
-	adds	r3,r3,r11
-	and	r9,r11,#0xff
-	adc	r4,r4,r12		@ T += K[i]
-	adds	r7,r7,r3
-	ldr	r11,[sp,#8+0]	@ b.lo
-	adc	r8,r8,r4		@ d += T
-	teq	r9,#148
-
-	ldr	r12,[sp,#16+0]	@ c.lo
-#if __ARM_ARCH__>=7
-	it	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	orreq	r14,r14,#1
-	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
-	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
-	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
-	mov	r9,r5,lsr#28
-	mov	r10,r6,lsr#28
-	eor	r9,r9,r6,lsl#4
-	eor	r10,r10,r5,lsl#4
-	eor	r9,r9,r6,lsr#2
-	eor	r10,r10,r5,lsr#2
-	eor	r9,r9,r5,lsl#30
-	eor	r10,r10,r6,lsl#30
-	eor	r9,r9,r6,lsr#7
-	eor	r10,r10,r5,lsr#7
-	eor	r9,r9,r5,lsl#25
-	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
-	adds	r3,r3,r9
-	and	r9,r5,r11
-	adc	r4,r4,r10		@ T += Sigma0(a)
-
-	ldr	r10,[sp,#8+4]	@ b.hi
-	orr	r5,r5,r11
-	ldr	r11,[sp,#16+4]	@ c.hi
-	and	r5,r5,r12
-	and	r12,r6,r10
-	orr	r6,r6,r10
-	orr	r5,r5,r9		@ Maj(a,b,c).lo
-	and	r6,r6,r11
-	adds	r5,r5,r3
-	orr	r6,r6,r12		@ Maj(a,b,c).hi
-	sub	sp,sp,#8
-	adc	r6,r6,r4		@ h += T
-	tst	r14,#1
-	add	r14,r14,#8
-	tst	r14,#1
-	beq	L00_15
-	ldr	r9,[sp,#184+0]
-	ldr	r10,[sp,#184+4]
-	bic	r14,r14,#1
-L16_79:
-	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
-	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
-	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
-	mov	r3,r9,lsr#1
-	ldr	r11,[sp,#80+0]
-	mov	r4,r10,lsr#1
-	ldr	r12,[sp,#80+4]
-	eor	r3,r3,r10,lsl#31
-	eor	r4,r4,r9,lsl#31
-	eor	r3,r3,r9,lsr#8
-	eor	r4,r4,r10,lsr#8
-	eor	r3,r3,r10,lsl#24
-	eor	r4,r4,r9,lsl#24
-	eor	r3,r3,r9,lsr#7
-	eor	r4,r4,r10,lsr#7
-	eor	r3,r3,r10,lsl#25
-
-	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
-	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
-	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
-	mov	r9,r11,lsr#19
-	mov	r10,r12,lsr#19
-	eor	r9,r9,r12,lsl#13
-	eor	r10,r10,r11,lsl#13
-	eor	r9,r9,r12,lsr#29
-	eor	r10,r10,r11,lsr#29
-	eor	r9,r9,r11,lsl#3
-	eor	r10,r10,r12,lsl#3
-	eor	r9,r9,r11,lsr#6
-	eor	r10,r10,r12,lsr#6
-	ldr	r11,[sp,#120+0]
-	eor	r9,r9,r12,lsl#26
-
-	ldr	r12,[sp,#120+4]
-	adds	r3,r3,r9
-	ldr	r9,[sp,#192+0]
-	adc	r4,r4,r10
-
-	ldr	r10,[sp,#192+4]
-	adds	r3,r3,r11
-	adc	r4,r4,r12
-	adds	r3,r3,r9
-	adc	r4,r4,r10
-	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
-	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
-	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
-	mov	r9,r7,lsr#14
-	str	r3,[sp,#64+0]
-	mov	r10,r8,lsr#14
-	str	r4,[sp,#64+4]
-	eor	r9,r9,r8,lsl#18
-	ldr	r11,[sp,#56+0]	@ h.lo
-	eor	r10,r10,r7,lsl#18
-	ldr	r12,[sp,#56+4]	@ h.hi
-	eor	r9,r9,r7,lsr#18
-	eor	r10,r10,r8,lsr#18
-	eor	r9,r9,r8,lsl#14
-	eor	r10,r10,r7,lsl#14
-	eor	r9,r9,r8,lsr#9
-	eor	r10,r10,r7,lsr#9
-	eor	r9,r9,r7,lsl#23
-	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
-	adds	r3,r3,r9
-	ldr	r9,[sp,#40+0]	@ f.lo
-	adc	r4,r4,r10		@ T += Sigma1(e)
-	ldr	r10,[sp,#40+4]	@ f.hi
-	adds	r3,r3,r11
-	ldr	r11,[sp,#48+0]	@ g.lo
-	adc	r4,r4,r12		@ T += h
-	ldr	r12,[sp,#48+4]	@ g.hi
-
-	eor	r9,r9,r11
-	str	r7,[sp,#32+0]
-	eor	r10,r10,r12
-	str	r8,[sp,#32+4]
-	and	r9,r9,r7
-	str	r5,[sp,#0+0]
-	and	r10,r10,r8
-	str	r6,[sp,#0+4]
-	eor	r9,r9,r11
-	ldr	r11,[r14,#LO]	@ K[i].lo
-	eor	r10,r10,r12		@ Ch(e,f,g)
-	ldr	r12,[r14,#HI]	@ K[i].hi
-
-	adds	r3,r3,r9
-	ldr	r7,[sp,#24+0]	@ d.lo
-	adc	r4,r4,r10		@ T += Ch(e,f,g)
-	ldr	r8,[sp,#24+4]	@ d.hi
-	adds	r3,r3,r11
-	and	r9,r11,#0xff
-	adc	r4,r4,r12		@ T += K[i]
-	adds	r7,r7,r3
-	ldr	r11,[sp,#8+0]	@ b.lo
-	adc	r8,r8,r4		@ d += T
-	teq	r9,#23
-
-	ldr	r12,[sp,#16+0]	@ c.lo
-#if __ARM_ARCH__>=7
-	it	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	orreq	r14,r14,#1
-	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
-	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
-	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
-	mov	r9,r5,lsr#28
-	mov	r10,r6,lsr#28
-	eor	r9,r9,r6,lsl#4
-	eor	r10,r10,r5,lsl#4
-	eor	r9,r9,r6,lsr#2
-	eor	r10,r10,r5,lsr#2
-	eor	r9,r9,r5,lsl#30
-	eor	r10,r10,r6,lsl#30
-	eor	r9,r9,r6,lsr#7
-	eor	r10,r10,r5,lsr#7
-	eor	r9,r9,r5,lsl#25
-	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
-	adds	r3,r3,r9
-	and	r9,r5,r11
-	adc	r4,r4,r10		@ T += Sigma0(a)
-
-	ldr	r10,[sp,#8+4]	@ b.hi
-	orr	r5,r5,r11
-	ldr	r11,[sp,#16+4]	@ c.hi
-	and	r5,r5,r12
-	and	r12,r6,r10
-	orr	r6,r6,r10
-	orr	r5,r5,r9		@ Maj(a,b,c).lo
-	and	r6,r6,r11
-	adds	r5,r5,r3
-	orr	r6,r6,r12		@ Maj(a,b,c).hi
-	sub	sp,sp,#8
-	adc	r6,r6,r4		@ h += T
-	tst	r14,#1
-	add	r14,r14,#8
-#if __ARM_ARCH__>=7
-	ittt	eq			@ Thumb2 thing, sanity check in ARM
-#endif
-	ldreq	r9,[sp,#184+0]
-	ldreq	r10,[sp,#184+4]
-	beq	L16_79
-	bic	r14,r14,#1
-
-	ldr	r3,[sp,#8+0]
-	ldr	r4,[sp,#8+4]
-	ldr	r9, [r0,#0+LO]
-	ldr	r10, [r0,#0+HI]
-	ldr	r11, [r0,#8+LO]
-	ldr	r12, [r0,#8+HI]
-	adds	r9,r5,r9
-	str	r9, [r0,#0+LO]
-	adc	r10,r6,r10
-	str	r10, [r0,#0+HI]
-	adds	r11,r3,r11
-	str	r11, [r0,#8+LO]
-	adc	r12,r4,r12
-	str	r12, [r0,#8+HI]
-
-	ldr	r5,[sp,#16+0]
-	ldr	r6,[sp,#16+4]
-	ldr	r3,[sp,#24+0]
-	ldr	r4,[sp,#24+4]
-	ldr	r9, [r0,#16+LO]
-	ldr	r10, [r0,#16+HI]
-	ldr	r11, [r0,#24+LO]
-	ldr	r12, [r0,#24+HI]
-	adds	r9,r5,r9
-	str	r9, [r0,#16+LO]
-	adc	r10,r6,r10
-	str	r10, [r0,#16+HI]
-	adds	r11,r3,r11
-	str	r11, [r0,#24+LO]
-	adc	r12,r4,r12
-	str	r12, [r0,#24+HI]
-
-	ldr	r3,[sp,#40+0]
-	ldr	r4,[sp,#40+4]
-	ldr	r9, [r0,#32+LO]
-	ldr	r10, [r0,#32+HI]
-	ldr	r11, [r0,#40+LO]
-	ldr	r12, [r0,#40+HI]
-	adds	r7,r7,r9
-	str	r7,[r0,#32+LO]
-	adc	r8,r8,r10
-	str	r8,[r0,#32+HI]
-	adds	r11,r3,r11
-	str	r11, [r0,#40+LO]
-	adc	r12,r4,r12
-	str	r12, [r0,#40+HI]
-
-	ldr	r5,[sp,#48+0]
-	ldr	r6,[sp,#48+4]
-	ldr	r3,[sp,#56+0]
-	ldr	r4,[sp,#56+4]
-	ldr	r9, [r0,#48+LO]
-	ldr	r10, [r0,#48+HI]
-	ldr	r11, [r0,#56+LO]
-	ldr	r12, [r0,#56+HI]
-	adds	r9,r5,r9
-	str	r9, [r0,#48+LO]
-	adc	r10,r6,r10
-	str	r10, [r0,#48+HI]
-	adds	r11,r3,r11
-	str	r11, [r0,#56+LO]
-	adc	r12,r4,r12
-	str	r12, [r0,#56+HI]
-
-	add	sp,sp,#640
-	sub	r14,r14,#640
-
-	teq	r1,r2
-	bne	Loop
-
-	add	sp,sp,#8*9		@ destroy frame
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
-#else
-	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-
-
-
-.globl	_sha512_block_data_order_neon
-.private_extern	_sha512_block_data_order_neon
-#ifdef __thumb2__
-.thumb_func	_sha512_block_data_order_neon
-#endif
-.align	4
-_sha512_block_data_order_neon:
-LNEON:
-	dmb	@ errata #451034 on early Cortex A8
-	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
-	adr	r3,K512
-	VFP_ABI_PUSH
-	vldmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}		@ load context
-Loop_neon:
-	vshr.u64	d24,d20,#14	@ 0
-#if 0<16
-	vld1.64	{d0},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d20,#18
-#if 0>0
-	vadd.i64	d16,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d20,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d20,#50
-	vsli.64	d25,d20,#46
-	vmov	d29,d20
-	vsli.64	d26,d20,#23
-#if 0<16 && defined(__ARMEL__)
-	vrev64.8	d0,d0
-#endif
-	veor	d25,d24
-	vbsl	d29,d21,d22		@ Ch(e,f,g)
-	vshr.u64	d24,d16,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d23
-	vshr.u64	d25,d16,#34
-	vsli.64	d24,d16,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d16,#39
-	vadd.i64	d28,d0
-	vsli.64	d25,d16,#30
-	veor	d30,d16,d17
-	vsli.64	d26,d16,#25
-	veor	d23,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d18,d17		@ Maj(a,b,c)
-	veor	d23,d26			@ Sigma0(a)
-	vadd.i64	d19,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d23,d30
-	vshr.u64	d24,d19,#14	@ 1
-#if 1<16
-	vld1.64	{d1},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d19,#18
-#if 1>0
-	vadd.i64	d23,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d19,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vmov	d29,d19
-	vsli.64	d26,d19,#23
-#if 1<16 && defined(__ARMEL__)
-	vrev64.8	d1,d1
-#endif
-	veor	d25,d24
-	vbsl	d29,d20,d21		@ Ch(e,f,g)
-	vshr.u64	d24,d23,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d22
-	vshr.u64	d25,d23,#34
-	vsli.64	d24,d23,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d23,#39
-	vadd.i64	d28,d1
-	vsli.64	d25,d23,#30
-	veor	d30,d23,d16
-	vsli.64	d26,d23,#25
-	veor	d22,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d17,d16		@ Maj(a,b,c)
-	veor	d22,d26			@ Sigma0(a)
-	vadd.i64	d18,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d22,d30
-	vshr.u64	d24,d18,#14	@ 2
-#if 2<16
-	vld1.64	{d2},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d18,#18
-#if 2>0
-	vadd.i64	d22,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d18,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vmov	d29,d18
-	vsli.64	d26,d18,#23
-#if 2<16 && defined(__ARMEL__)
-	vrev64.8	d2,d2
-#endif
-	veor	d25,d24
-	vbsl	d29,d19,d20		@ Ch(e,f,g)
-	vshr.u64	d24,d22,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d21
-	vshr.u64	d25,d22,#34
-	vsli.64	d24,d22,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d22,#39
-	vadd.i64	d28,d2
-	vsli.64	d25,d22,#30
-	veor	d30,d22,d23
-	vsli.64	d26,d22,#25
-	veor	d21,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d16,d23		@ Maj(a,b,c)
-	veor	d21,d26			@ Sigma0(a)
-	vadd.i64	d17,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 3
-#if 3<16
-	vld1.64	{d3},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-#if 3>0
-	vadd.i64	d21,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vmov	d29,d17
-	vsli.64	d26,d17,#23
-#if 3<16 && defined(__ARMEL__)
-	vrev64.8	d3,d3
-#endif
-	veor	d25,d24
-	vbsl	d29,d18,d19		@ Ch(e,f,g)
-	vshr.u64	d24,d21,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d20
-	vshr.u64	d25,d21,#34
-	vsli.64	d24,d21,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d21,#39
-	vadd.i64	d28,d3
-	vsli.64	d25,d21,#30
-	veor	d30,d21,d22
-	vsli.64	d26,d21,#25
-	veor	d20,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d23,d22		@ Maj(a,b,c)
-	veor	d20,d26			@ Sigma0(a)
-	vadd.i64	d16,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d20,d30
-	vshr.u64	d24,d16,#14	@ 4
-#if 4<16
-	vld1.64	{d4},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d16,#18
-#if 4>0
-	vadd.i64	d20,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d16,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vmov	d29,d16
-	vsli.64	d26,d16,#23
-#if 4<16 && defined(__ARMEL__)
-	vrev64.8	d4,d4
-#endif
-	veor	d25,d24
-	vbsl	d29,d17,d18		@ Ch(e,f,g)
-	vshr.u64	d24,d20,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d19
-	vshr.u64	d25,d20,#34
-	vsli.64	d24,d20,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d20,#39
-	vadd.i64	d28,d4
-	vsli.64	d25,d20,#30
-	veor	d30,d20,d21
-	vsli.64	d26,d20,#25
-	veor	d19,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d22,d21		@ Maj(a,b,c)
-	veor	d19,d26			@ Sigma0(a)
-	vadd.i64	d23,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 5
-#if 5<16
-	vld1.64	{d5},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-#if 5>0
-	vadd.i64	d19,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vmov	d29,d23
-	vsli.64	d26,d23,#23
-#if 5<16 && defined(__ARMEL__)
-	vrev64.8	d5,d5
-#endif
-	veor	d25,d24
-	vbsl	d29,d16,d17		@ Ch(e,f,g)
-	vshr.u64	d24,d19,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d18
-	vshr.u64	d25,d19,#34
-	vsli.64	d24,d19,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d19,#39
-	vadd.i64	d28,d5
-	vsli.64	d25,d19,#30
-	veor	d30,d19,d20
-	vsli.64	d26,d19,#25
-	veor	d18,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d21,d20		@ Maj(a,b,c)
-	veor	d18,d26			@ Sigma0(a)
-	vadd.i64	d22,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d18,d30
-	vshr.u64	d24,d22,#14	@ 6
-#if 6<16
-	vld1.64	{d6},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d22,#18
-#if 6>0
-	vadd.i64	d18,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d22,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vmov	d29,d22
-	vsli.64	d26,d22,#23
-#if 6<16 && defined(__ARMEL__)
-	vrev64.8	d6,d6
-#endif
-	veor	d25,d24
-	vbsl	d29,d23,d16		@ Ch(e,f,g)
-	vshr.u64	d24,d18,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d17
-	vshr.u64	d25,d18,#34
-	vsli.64	d24,d18,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d18,#39
-	vadd.i64	d28,d6
-	vsli.64	d25,d18,#30
-	veor	d30,d18,d19
-	vsli.64	d26,d18,#25
-	veor	d17,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d20,d19		@ Maj(a,b,c)
-	veor	d17,d26			@ Sigma0(a)
-	vadd.i64	d21,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 7
-#if 7<16
-	vld1.64	{d7},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-#if 7>0
-	vadd.i64	d17,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vmov	d29,d21
-	vsli.64	d26,d21,#23
-#if 7<16 && defined(__ARMEL__)
-	vrev64.8	d7,d7
-#endif
-	veor	d25,d24
-	vbsl	d29,d22,d23		@ Ch(e,f,g)
-	vshr.u64	d24,d17,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d16
-	vshr.u64	d25,d17,#34
-	vsli.64	d24,d17,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d17,#39
-	vadd.i64	d28,d7
-	vsli.64	d25,d17,#30
-	veor	d30,d17,d18
-	vsli.64	d26,d17,#25
-	veor	d16,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d19,d18		@ Maj(a,b,c)
-	veor	d16,d26			@ Sigma0(a)
-	vadd.i64	d20,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d16,d30
-	vshr.u64	d24,d20,#14	@ 8
-#if 8<16
-	vld1.64	{d8},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d20,#18
-#if 8>0
-	vadd.i64	d16,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d20,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d20,#50
-	vsli.64	d25,d20,#46
-	vmov	d29,d20
-	vsli.64	d26,d20,#23
-#if 8<16 && defined(__ARMEL__)
-	vrev64.8	d8,d8
-#endif
-	veor	d25,d24
-	vbsl	d29,d21,d22		@ Ch(e,f,g)
-	vshr.u64	d24,d16,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d23
-	vshr.u64	d25,d16,#34
-	vsli.64	d24,d16,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d16,#39
-	vadd.i64	d28,d8
-	vsli.64	d25,d16,#30
-	veor	d30,d16,d17
-	vsli.64	d26,d16,#25
-	veor	d23,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d18,d17		@ Maj(a,b,c)
-	veor	d23,d26			@ Sigma0(a)
-	vadd.i64	d19,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d23,d30
-	vshr.u64	d24,d19,#14	@ 9
-#if 9<16
-	vld1.64	{d9},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d19,#18
-#if 9>0
-	vadd.i64	d23,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d19,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vmov	d29,d19
-	vsli.64	d26,d19,#23
-#if 9<16 && defined(__ARMEL__)
-	vrev64.8	d9,d9
-#endif
-	veor	d25,d24
-	vbsl	d29,d20,d21		@ Ch(e,f,g)
-	vshr.u64	d24,d23,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d22
-	vshr.u64	d25,d23,#34
-	vsli.64	d24,d23,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d23,#39
-	vadd.i64	d28,d9
-	vsli.64	d25,d23,#30
-	veor	d30,d23,d16
-	vsli.64	d26,d23,#25
-	veor	d22,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d17,d16		@ Maj(a,b,c)
-	veor	d22,d26			@ Sigma0(a)
-	vadd.i64	d18,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d22,d30
-	vshr.u64	d24,d18,#14	@ 10
-#if 10<16
-	vld1.64	{d10},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d18,#18
-#if 10>0
-	vadd.i64	d22,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d18,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vmov	d29,d18
-	vsli.64	d26,d18,#23
-#if 10<16 && defined(__ARMEL__)
-	vrev64.8	d10,d10
-#endif
-	veor	d25,d24
-	vbsl	d29,d19,d20		@ Ch(e,f,g)
-	vshr.u64	d24,d22,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d21
-	vshr.u64	d25,d22,#34
-	vsli.64	d24,d22,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d22,#39
-	vadd.i64	d28,d10
-	vsli.64	d25,d22,#30
-	veor	d30,d22,d23
-	vsli.64	d26,d22,#25
-	veor	d21,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d16,d23		@ Maj(a,b,c)
-	veor	d21,d26			@ Sigma0(a)
-	vadd.i64	d17,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 11
-#if 11<16
-	vld1.64	{d11},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-#if 11>0
-	vadd.i64	d21,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vmov	d29,d17
-	vsli.64	d26,d17,#23
-#if 11<16 && defined(__ARMEL__)
-	vrev64.8	d11,d11
-#endif
-	veor	d25,d24
-	vbsl	d29,d18,d19		@ Ch(e,f,g)
-	vshr.u64	d24,d21,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d20
-	vshr.u64	d25,d21,#34
-	vsli.64	d24,d21,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d21,#39
-	vadd.i64	d28,d11
-	vsli.64	d25,d21,#30
-	veor	d30,d21,d22
-	vsli.64	d26,d21,#25
-	veor	d20,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d23,d22		@ Maj(a,b,c)
-	veor	d20,d26			@ Sigma0(a)
-	vadd.i64	d16,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d20,d30
-	vshr.u64	d24,d16,#14	@ 12
-#if 12<16
-	vld1.64	{d12},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d16,#18
-#if 12>0
-	vadd.i64	d20,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d16,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vmov	d29,d16
-	vsli.64	d26,d16,#23
-#if 12<16 && defined(__ARMEL__)
-	vrev64.8	d12,d12
-#endif
-	veor	d25,d24
-	vbsl	d29,d17,d18		@ Ch(e,f,g)
-	vshr.u64	d24,d20,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d19
-	vshr.u64	d25,d20,#34
-	vsli.64	d24,d20,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d20,#39
-	vadd.i64	d28,d12
-	vsli.64	d25,d20,#30
-	veor	d30,d20,d21
-	vsli.64	d26,d20,#25
-	veor	d19,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d22,d21		@ Maj(a,b,c)
-	veor	d19,d26			@ Sigma0(a)
-	vadd.i64	d23,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 13
-#if 13<16
-	vld1.64	{d13},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-#if 13>0
-	vadd.i64	d19,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vmov	d29,d23
-	vsli.64	d26,d23,#23
-#if 13<16 && defined(__ARMEL__)
-	vrev64.8	d13,d13
-#endif
-	veor	d25,d24
-	vbsl	d29,d16,d17		@ Ch(e,f,g)
-	vshr.u64	d24,d19,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d18
-	vshr.u64	d25,d19,#34
-	vsli.64	d24,d19,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d19,#39
-	vadd.i64	d28,d13
-	vsli.64	d25,d19,#30
-	veor	d30,d19,d20
-	vsli.64	d26,d19,#25
-	veor	d18,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d21,d20		@ Maj(a,b,c)
-	veor	d18,d26			@ Sigma0(a)
-	vadd.i64	d22,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d18,d30
-	vshr.u64	d24,d22,#14	@ 14
-#if 14<16
-	vld1.64	{d14},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d22,#18
-#if 14>0
-	vadd.i64	d18,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d22,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vmov	d29,d22
-	vsli.64	d26,d22,#23
-#if 14<16 && defined(__ARMEL__)
-	vrev64.8	d14,d14
-#endif
-	veor	d25,d24
-	vbsl	d29,d23,d16		@ Ch(e,f,g)
-	vshr.u64	d24,d18,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d17
-	vshr.u64	d25,d18,#34
-	vsli.64	d24,d18,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d18,#39
-	vadd.i64	d28,d14
-	vsli.64	d25,d18,#30
-	veor	d30,d18,d19
-	vsli.64	d26,d18,#25
-	veor	d17,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d20,d19		@ Maj(a,b,c)
-	veor	d17,d26			@ Sigma0(a)
-	vadd.i64	d21,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 15
-#if 15<16
-	vld1.64	{d15},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-#if 15>0
-	vadd.i64	d17,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vmov	d29,d21
-	vsli.64	d26,d21,#23
-#if 15<16 && defined(__ARMEL__)
-	vrev64.8	d15,d15
-#endif
-	veor	d25,d24
-	vbsl	d29,d22,d23		@ Ch(e,f,g)
-	vshr.u64	d24,d17,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d16
-	vshr.u64	d25,d17,#34
-	vsli.64	d24,d17,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d17,#39
-	vadd.i64	d28,d15
-	vsli.64	d25,d17,#30
-	veor	d30,d17,d18
-	vsli.64	d26,d17,#25
-	veor	d16,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d19,d18		@ Maj(a,b,c)
-	veor	d16,d26			@ Sigma0(a)
-	vadd.i64	d20,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d16,d30
-	mov	r12,#4
-L16_79_neon:
-	subs	r12,#1
-	vshr.u64	q12,q7,#19
-	vshr.u64	q13,q7,#61
-	vadd.i64	d16,d30			@ h+=Maj from the past
-	vshr.u64	q15,q7,#6
-	vsli.64	q12,q7,#45
-	vext.8	q14,q0,q1,#8	@ X[i+1]
-	vsli.64	q13,q7,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q0,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q4,q5,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d20,#14		@ from NEON_00_15
-	vadd.i64	q0,q14
-	vshr.u64	d25,d20,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d20,#41		@ from NEON_00_15
-	vadd.i64	q0,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d20,#50
-	vsli.64	d25,d20,#46
-	vmov	d29,d20
-	vsli.64	d26,d20,#23
-#if 16<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d21,d22		@ Ch(e,f,g)
-	vshr.u64	d24,d16,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d23
-	vshr.u64	d25,d16,#34
-	vsli.64	d24,d16,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d16,#39
-	vadd.i64	d28,d0
-	vsli.64	d25,d16,#30
-	veor	d30,d16,d17
-	vsli.64	d26,d16,#25
-	veor	d23,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d18,d17		@ Maj(a,b,c)
-	veor	d23,d26			@ Sigma0(a)
-	vadd.i64	d19,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d23,d30
-	vshr.u64	d24,d19,#14	@ 17
-#if 17<16
-	vld1.64	{d1},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d19,#18
-#if 17>0
-	vadd.i64	d23,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d19,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vmov	d29,d19
-	vsli.64	d26,d19,#23
-#if 17<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d20,d21		@ Ch(e,f,g)
-	vshr.u64	d24,d23,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d22
-	vshr.u64	d25,d23,#34
-	vsli.64	d24,d23,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d23,#39
-	vadd.i64	d28,d1
-	vsli.64	d25,d23,#30
-	veor	d30,d23,d16
-	vsli.64	d26,d23,#25
-	veor	d22,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d17,d16		@ Maj(a,b,c)
-	veor	d22,d26			@ Sigma0(a)
-	vadd.i64	d18,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d22,d30
-	vshr.u64	q12,q0,#19
-	vshr.u64	q13,q0,#61
-	vadd.i64	d22,d30			@ h+=Maj from the past
-	vshr.u64	q15,q0,#6
-	vsli.64	q12,q0,#45
-	vext.8	q14,q1,q2,#8	@ X[i+1]
-	vsli.64	q13,q0,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q1,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q5,q6,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d18,#14		@ from NEON_00_15
-	vadd.i64	q1,q14
-	vshr.u64	d25,d18,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d18,#41		@ from NEON_00_15
-	vadd.i64	q1,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vmov	d29,d18
-	vsli.64	d26,d18,#23
-#if 18<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d19,d20		@ Ch(e,f,g)
-	vshr.u64	d24,d22,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d21
-	vshr.u64	d25,d22,#34
-	vsli.64	d24,d22,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d22,#39
-	vadd.i64	d28,d2
-	vsli.64	d25,d22,#30
-	veor	d30,d22,d23
-	vsli.64	d26,d22,#25
-	veor	d21,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d16,d23		@ Maj(a,b,c)
-	veor	d21,d26			@ Sigma0(a)
-	vadd.i64	d17,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 19
-#if 19<16
-	vld1.64	{d3},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-#if 19>0
-	vadd.i64	d21,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vmov	d29,d17
-	vsli.64	d26,d17,#23
-#if 19<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d18,d19		@ Ch(e,f,g)
-	vshr.u64	d24,d21,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d20
-	vshr.u64	d25,d21,#34
-	vsli.64	d24,d21,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d21,#39
-	vadd.i64	d28,d3
-	vsli.64	d25,d21,#30
-	veor	d30,d21,d22
-	vsli.64	d26,d21,#25
-	veor	d20,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d23,d22		@ Maj(a,b,c)
-	veor	d20,d26			@ Sigma0(a)
-	vadd.i64	d16,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d20,d30
-	vshr.u64	q12,q1,#19
-	vshr.u64	q13,q1,#61
-	vadd.i64	d20,d30			@ h+=Maj from the past
-	vshr.u64	q15,q1,#6
-	vsli.64	q12,q1,#45
-	vext.8	q14,q2,q3,#8	@ X[i+1]
-	vsli.64	q13,q1,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q2,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q6,q7,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d16,#14		@ from NEON_00_15
-	vadd.i64	q2,q14
-	vshr.u64	d25,d16,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d16,#41		@ from NEON_00_15
-	vadd.i64	q2,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vmov	d29,d16
-	vsli.64	d26,d16,#23
-#if 20<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d17,d18		@ Ch(e,f,g)
-	vshr.u64	d24,d20,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d19
-	vshr.u64	d25,d20,#34
-	vsli.64	d24,d20,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d20,#39
-	vadd.i64	d28,d4
-	vsli.64	d25,d20,#30
-	veor	d30,d20,d21
-	vsli.64	d26,d20,#25
-	veor	d19,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d22,d21		@ Maj(a,b,c)
-	veor	d19,d26			@ Sigma0(a)
-	vadd.i64	d23,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 21
-#if 21<16
-	vld1.64	{d5},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-#if 21>0
-	vadd.i64	d19,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vmov	d29,d23
-	vsli.64	d26,d23,#23
-#if 21<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d16,d17		@ Ch(e,f,g)
-	vshr.u64	d24,d19,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d18
-	vshr.u64	d25,d19,#34
-	vsli.64	d24,d19,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d19,#39
-	vadd.i64	d28,d5
-	vsli.64	d25,d19,#30
-	veor	d30,d19,d20
-	vsli.64	d26,d19,#25
-	veor	d18,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d21,d20		@ Maj(a,b,c)
-	veor	d18,d26			@ Sigma0(a)
-	vadd.i64	d22,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d18,d30
-	vshr.u64	q12,q2,#19
-	vshr.u64	q13,q2,#61
-	vadd.i64	d18,d30			@ h+=Maj from the past
-	vshr.u64	q15,q2,#6
-	vsli.64	q12,q2,#45
-	vext.8	q14,q3,q4,#8	@ X[i+1]
-	vsli.64	q13,q2,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q3,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q7,q0,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d22,#14		@ from NEON_00_15
-	vadd.i64	q3,q14
-	vshr.u64	d25,d22,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d22,#41		@ from NEON_00_15
-	vadd.i64	q3,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vmov	d29,d22
-	vsli.64	d26,d22,#23
-#if 22<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d23,d16		@ Ch(e,f,g)
-	vshr.u64	d24,d18,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d17
-	vshr.u64	d25,d18,#34
-	vsli.64	d24,d18,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d18,#39
-	vadd.i64	d28,d6
-	vsli.64	d25,d18,#30
-	veor	d30,d18,d19
-	vsli.64	d26,d18,#25
-	veor	d17,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d20,d19		@ Maj(a,b,c)
-	veor	d17,d26			@ Sigma0(a)
-	vadd.i64	d21,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 23
-#if 23<16
-	vld1.64	{d7},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-#if 23>0
-	vadd.i64	d17,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vmov	d29,d21
-	vsli.64	d26,d21,#23
-#if 23<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d22,d23		@ Ch(e,f,g)
-	vshr.u64	d24,d17,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d16
-	vshr.u64	d25,d17,#34
-	vsli.64	d24,d17,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d17,#39
-	vadd.i64	d28,d7
-	vsli.64	d25,d17,#30
-	veor	d30,d17,d18
-	vsli.64	d26,d17,#25
-	veor	d16,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d19,d18		@ Maj(a,b,c)
-	veor	d16,d26			@ Sigma0(a)
-	vadd.i64	d20,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d16,d30
-	vshr.u64	q12,q3,#19
-	vshr.u64	q13,q3,#61
-	vadd.i64	d16,d30			@ h+=Maj from the past
-	vshr.u64	q15,q3,#6
-	vsli.64	q12,q3,#45
-	vext.8	q14,q4,q5,#8	@ X[i+1]
-	vsli.64	q13,q3,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q4,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q0,q1,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d20,#14		@ from NEON_00_15
-	vadd.i64	q4,q14
-	vshr.u64	d25,d20,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d20,#41		@ from NEON_00_15
-	vadd.i64	q4,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d20,#50
-	vsli.64	d25,d20,#46
-	vmov	d29,d20
-	vsli.64	d26,d20,#23
-#if 24<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d21,d22		@ Ch(e,f,g)
-	vshr.u64	d24,d16,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d23
-	vshr.u64	d25,d16,#34
-	vsli.64	d24,d16,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d16,#39
-	vadd.i64	d28,d8
-	vsli.64	d25,d16,#30
-	veor	d30,d16,d17
-	vsli.64	d26,d16,#25
-	veor	d23,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d18,d17		@ Maj(a,b,c)
-	veor	d23,d26			@ Sigma0(a)
-	vadd.i64	d19,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d23,d30
-	vshr.u64	d24,d19,#14	@ 25
-#if 25<16
-	vld1.64	{d9},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d19,#18
-#if 25>0
-	vadd.i64	d23,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d19,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d19,#50
-	vsli.64	d25,d19,#46
-	vmov	d29,d19
-	vsli.64	d26,d19,#23
-#if 25<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d20,d21		@ Ch(e,f,g)
-	vshr.u64	d24,d23,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d22
-	vshr.u64	d25,d23,#34
-	vsli.64	d24,d23,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d23,#39
-	vadd.i64	d28,d9
-	vsli.64	d25,d23,#30
-	veor	d30,d23,d16
-	vsli.64	d26,d23,#25
-	veor	d22,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d17,d16		@ Maj(a,b,c)
-	veor	d22,d26			@ Sigma0(a)
-	vadd.i64	d18,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d22,d30
-	vshr.u64	q12,q4,#19
-	vshr.u64	q13,q4,#61
-	vadd.i64	d22,d30			@ h+=Maj from the past
-	vshr.u64	q15,q4,#6
-	vsli.64	q12,q4,#45
-	vext.8	q14,q5,q6,#8	@ X[i+1]
-	vsli.64	q13,q4,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q5,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q1,q2,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d18,#14		@ from NEON_00_15
-	vadd.i64	q5,q14
-	vshr.u64	d25,d18,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d18,#41		@ from NEON_00_15
-	vadd.i64	q5,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d18,#50
-	vsli.64	d25,d18,#46
-	vmov	d29,d18
-	vsli.64	d26,d18,#23
-#if 26<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d19,d20		@ Ch(e,f,g)
-	vshr.u64	d24,d22,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d21
-	vshr.u64	d25,d22,#34
-	vsli.64	d24,d22,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d22,#39
-	vadd.i64	d28,d10
-	vsli.64	d25,d22,#30
-	veor	d30,d22,d23
-	vsli.64	d26,d22,#25
-	veor	d21,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d16,d23		@ Maj(a,b,c)
-	veor	d21,d26			@ Sigma0(a)
-	vadd.i64	d17,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d21,d30
-	vshr.u64	d24,d17,#14	@ 27
-#if 27<16
-	vld1.64	{d11},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d17,#18
-#if 27>0
-	vadd.i64	d21,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d17,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d17,#50
-	vsli.64	d25,d17,#46
-	vmov	d29,d17
-	vsli.64	d26,d17,#23
-#if 27<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d18,d19		@ Ch(e,f,g)
-	vshr.u64	d24,d21,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d20
-	vshr.u64	d25,d21,#34
-	vsli.64	d24,d21,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d21,#39
-	vadd.i64	d28,d11
-	vsli.64	d25,d21,#30
-	veor	d30,d21,d22
-	vsli.64	d26,d21,#25
-	veor	d20,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d23,d22		@ Maj(a,b,c)
-	veor	d20,d26			@ Sigma0(a)
-	vadd.i64	d16,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d20,d30
-	vshr.u64	q12,q5,#19
-	vshr.u64	q13,q5,#61
-	vadd.i64	d20,d30			@ h+=Maj from the past
-	vshr.u64	q15,q5,#6
-	vsli.64	q12,q5,#45
-	vext.8	q14,q6,q7,#8	@ X[i+1]
-	vsli.64	q13,q5,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q6,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q2,q3,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d16,#14		@ from NEON_00_15
-	vadd.i64	q6,q14
-	vshr.u64	d25,d16,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d16,#41		@ from NEON_00_15
-	vadd.i64	q6,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d16,#50
-	vsli.64	d25,d16,#46
-	vmov	d29,d16
-	vsli.64	d26,d16,#23
-#if 28<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d17,d18		@ Ch(e,f,g)
-	vshr.u64	d24,d20,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d19
-	vshr.u64	d25,d20,#34
-	vsli.64	d24,d20,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d20,#39
-	vadd.i64	d28,d12
-	vsli.64	d25,d20,#30
-	veor	d30,d20,d21
-	vsli.64	d26,d20,#25
-	veor	d19,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d22,d21		@ Maj(a,b,c)
-	veor	d19,d26			@ Sigma0(a)
-	vadd.i64	d23,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d19,d30
-	vshr.u64	d24,d23,#14	@ 29
-#if 29<16
-	vld1.64	{d13},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d23,#18
-#if 29>0
-	vadd.i64	d19,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d23,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d23,#50
-	vsli.64	d25,d23,#46
-	vmov	d29,d23
-	vsli.64	d26,d23,#23
-#if 29<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d16,d17		@ Ch(e,f,g)
-	vshr.u64	d24,d19,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d18
-	vshr.u64	d25,d19,#34
-	vsli.64	d24,d19,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d19,#39
-	vadd.i64	d28,d13
-	vsli.64	d25,d19,#30
-	veor	d30,d19,d20
-	vsli.64	d26,d19,#25
-	veor	d18,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d21,d20		@ Maj(a,b,c)
-	veor	d18,d26			@ Sigma0(a)
-	vadd.i64	d22,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d18,d30
-	vshr.u64	q12,q6,#19
-	vshr.u64	q13,q6,#61
-	vadd.i64	d18,d30			@ h+=Maj from the past
-	vshr.u64	q15,q6,#6
-	vsli.64	q12,q6,#45
-	vext.8	q14,q7,q0,#8	@ X[i+1]
-	vsli.64	q13,q6,#3
-	veor	q15,q12
-	vshr.u64	q12,q14,#1
-	veor	q15,q13				@ sigma1(X[i+14])
-	vshr.u64	q13,q14,#8
-	vadd.i64	q7,q15
-	vshr.u64	q15,q14,#7
-	vsli.64	q12,q14,#63
-	vsli.64	q13,q14,#56
-	vext.8	q14,q3,q4,#8	@ X[i+9]
-	veor	q15,q12
-	vshr.u64	d24,d22,#14		@ from NEON_00_15
-	vadd.i64	q7,q14
-	vshr.u64	d25,d22,#18		@ from NEON_00_15
-	veor	q15,q13				@ sigma0(X[i+1])
-	vshr.u64	d26,d22,#41		@ from NEON_00_15
-	vadd.i64	q7,q15
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d22,#50
-	vsli.64	d25,d22,#46
-	vmov	d29,d22
-	vsli.64	d26,d22,#23
-#if 30<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d23,d16		@ Ch(e,f,g)
-	vshr.u64	d24,d18,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d17
-	vshr.u64	d25,d18,#34
-	vsli.64	d24,d18,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d18,#39
-	vadd.i64	d28,d14
-	vsli.64	d25,d18,#30
-	veor	d30,d18,d19
-	vsli.64	d26,d18,#25
-	veor	d17,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d20,d19		@ Maj(a,b,c)
-	veor	d17,d26			@ Sigma0(a)
-	vadd.i64	d21,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d17,d30
-	vshr.u64	d24,d21,#14	@ 31
-#if 31<16
-	vld1.64	{d15},[r1]!	@ handles unaligned
-#endif
-	vshr.u64	d25,d21,#18
-#if 31>0
-	vadd.i64	d17,d30			@ h+=Maj from the past
-#endif
-	vshr.u64	d26,d21,#41
-	vld1.64	{d28},[r3,:64]!	@ K[i++]
-	vsli.64	d24,d21,#50
-	vsli.64	d25,d21,#46
-	vmov	d29,d21
-	vsli.64	d26,d21,#23
-#if 31<16 && defined(__ARMEL__)
-	vrev64.8	,
-#endif
-	veor	d25,d24
-	vbsl	d29,d22,d23		@ Ch(e,f,g)
-	vshr.u64	d24,d17,#28
-	veor	d26,d25			@ Sigma1(e)
-	vadd.i64	d27,d29,d16
-	vshr.u64	d25,d17,#34
-	vsli.64	d24,d17,#36
-	vadd.i64	d27,d26
-	vshr.u64	d26,d17,#39
-	vadd.i64	d28,d15
-	vsli.64	d25,d17,#30
-	veor	d30,d17,d18
-	vsli.64	d26,d17,#25
-	veor	d16,d24,d25
-	vadd.i64	d27,d28
-	vbsl	d30,d19,d18		@ Maj(a,b,c)
-	veor	d16,d26			@ Sigma0(a)
-	vadd.i64	d20,d27
-	vadd.i64	d30,d27
-	@ vadd.i64	d16,d30
-	bne	L16_79_neon
-
-	vadd.i64	d16,d30		@ h+=Maj from the past
-	vldmia	r0,{d24,d25,d26,d27,d28,d29,d30,d31}	@ load context to temp
-	vadd.i64	q8,q12		@ vectorized accumulate
-	vadd.i64	q9,q13
-	vadd.i64	q10,q14
-	vadd.i64	q11,q15
-	vstmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}	@ save context
-	teq	r1,r2
-	sub	r3,#640	@ rewind K512
-	bne	Loop_neon
-
-	VFP_ABI_POP
-	bx	lr				@ .word	0xe12fff1e
-
-#endif
-.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align	2
-.align	2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm	_OPENSSL_armcap_P,4
-.non_lazy_symbol_pointer
-OPENSSL_armcap_P:
-.indirect_symbol	_OPENSSL_armcap_P
-.long	0
-.private_extern	_OPENSSL_armcap_P
-#endif
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S b/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S
deleted file mode 100644
index 4cdc521..0000000
--- a/apple-arm/crypto/fipsmodule/vpaes-armv7-apple.S
+++ /dev/null
@@ -1,1257 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-.syntax	unified
-
-
-
-
-#if defined(__thumb2__)
-.thumb
-#else
-.code	32
-#endif
-
-.text
-
-
-.align	7	@ totally strategic alignment
-_vpaes_consts:
-Lk_mc_forward:@ mc_forward
-.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad	0x080B0A0904070605, 0x000302010C0F0E0D
-.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad	0x000302010C0F0E0D, 0x080B0A0904070605
-Lk_mc_backward:@ mc_backward
-.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad	0x020100030E0D0C0F, 0x0A09080B06050407
-.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad	0x0A09080B06050407, 0x020100030E0D0C0F
-Lk_sr:@ sr
-.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad	0x030E09040F0A0500, 0x0B06010C07020D08
-.quad	0x0F060D040B020900, 0x070E050C030A0108
-.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
-
-@
-@ "Hot" constants
-@
-Lk_inv:@ inv, inva
-.quad	0x0E05060F0D080180, 0x040703090A0B0C02
-.quad	0x01040A060F0B0780, 0x030D0E0C02050809
-Lk_ipt:@ input transform (lo, hi)
-.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-Lk_sbo:@ sbou, sbot
-.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-Lk_sb1:@ sb1u, sb1t
-.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-Lk_sb2:@ sb2u, sb2t
-.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
-.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
-
-.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.align	2
-
-.align	6
-@@
-@@  _aes_preheat
-@@
-@@  Fills q9-q15 as specified below.
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_preheat
-#endif
-.align	4
-_vpaes_preheat:
-	adr	r10, Lk_inv
-	vmov.i8	q9, #0x0f		@ Lk_s0F
-	vld1.64	{q10,q11}, [r10]!	@ Lk_inv
-	add	r10, r10, #64		@ Skip Lk_ipt, Lk_sbo
-	vld1.64	{q12,q13}, [r10]!	@ Lk_sb1
-	vld1.64	{q14,q15}, [r10]	@ Lk_sb2
-	bx	lr
-
-@@
-@@  _aes_encrypt_core
-@@
-@@  AES-encrypt q0.
-@@
-@@  Inputs:
-@@     q0 = input
-@@     q9-q15 as in _vpaes_preheat
-@@    [r2] = scheduled keys
-@@
-@@  Output in q0
-@@  Clobbers  q1-q5, r8-r11
-@@  Preserves q6-q8 so you get some local vectors
-@@
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_encrypt_core
-#endif
-.align	4
-_vpaes_encrypt_core:
-	mov	r9, r2
-	ldr	r8, [r2,#240]		@ pull rounds
-	adr	r11, Lk_ipt
-	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
-	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
-	vld1.64	{q2, q3}, [r11]
-	adr	r11, Lk_mc_forward+16
-	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
-	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
-	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
-	vtbl.8	d2, {q2}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm1
-	vtbl.8	d3, {q2}, d3
-	vtbl.8	d4, {q3}, d0	@ vpshufb	%xmm0,	%xmm3,	%xmm2
-	vtbl.8	d5, {q3}, d1
-	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
-	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
-
-	@ .Lenc_entry ends with a bnz instruction which is normally paired with
-	@ subs in .Lenc_loop.
-	tst	r8, r8
-	b	Lenc_entry
-
-.align	4
-Lenc_loop:
-	@ middle of middle round
-	add	r10, r11, #0x40
-	vtbl.8	d8, {q13}, d4	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
-	vtbl.8	d9, {q13}, d5
-	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
-	vtbl.8	d0, {q12}, d6	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
-	vtbl.8	d1, {q12}, d7
-	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	vtbl.8	d10, {q15}, d4	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
-	vtbl.8	d11, {q15}, d5
-	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
-	vtbl.8	d4, {q14}, d6	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
-	vtbl.8	d5, {q14}, d7
-	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
-	vtbl.8	d6, {q0}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
-	vtbl.8	d7, {q0}, d3
-	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
-	@ Write to q5 instead of q0, so the table and destination registers do
-	@ not overlap.
-	vtbl.8	d10, {q0}, d8	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
-	vtbl.8	d11, {q0}, d9
-	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
-	vtbl.8	d8, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
-	vtbl.8	d9, {q3}, d3
-	@ Here we restore the original q0/q5 usage.
-	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	and	r11, r11, #~(1<<6)	@ and		$0x30,	%r11		# ... mod 4
-	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
-	subs	r8, r8, #1		@ nr--
-
-Lenc_entry:
-	@ top of round
-	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
-	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	vtbl.8	d10, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
-	vtbl.8	d11, {q11}, d3
-	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
-	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
-	vtbl.8	d7, {q10}, d1
-	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
-	vtbl.8	d9, {q10}, d3
-	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
-	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
-	vtbl.8	d5, {q10}, d7
-	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
-	vtbl.8	d7, {q10}, d9
-	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
-	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
-	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
-	bne	Lenc_loop
-
-	@ middle of last round
-	add	r10, r11, #0x80
-
-	adr	r11, Lk_sbo
-	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
-	@ overlap table and destination registers.
-	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
-	vld1.64	{q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	Lk_sbo+16
-	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	vtbl.8	d9, {q1}, d5
-	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
-	@ Write to q2 instead of q0 below, to avoid overlapping table and
-	@ destination registers.
-	vtbl.8	d4, {q0}, d6	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
-	vtbl.8	d5, {q0}, d7
-	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
-	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
-	@ Here we restore the original q0/q2 usage.
-	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0
-	vtbl.8	d1, {q2}, d3
-	bx	lr
-
-
-.globl	_vpaes_encrypt
-.private_extern	_vpaes_encrypt
-#ifdef __thumb2__
-.thumb_func	_vpaes_encrypt
-#endif
-.align	4
-_vpaes_encrypt:
-	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
-	@ alignment.
-	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
-	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
-	vstmdb	sp!, {d8,d9,d10,d11}
-
-	vld1.64	{q0}, [r0]
-	bl	_vpaes_preheat
-	bl	_vpaes_encrypt_core
-	vst1.64	{q0}, [r1]
-
-	vldmia	sp!, {d8,d9,d10,d11}
-	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
-
-
-@
-@  Decryption stuff
-@
-
-.align	4
-_vpaes_decrypt_consts:
-Lk_dipt:@ decryption input transform
-.quad	0x0F505B040B545F00, 0x154A411E114E451A
-.quad	0x86E383E660056500, 0x12771772F491F194
-Lk_dsbo:@ decryption sbox final output
-.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-Lk_dsb9:@ decryption sbox output *9*u, *9*t
-.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-Lk_dsbd:@ decryption sbox output *D*u, *D*t
-.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-Lk_dsbb:@ decryption sbox output *B*u, *B*t
-.quad	0xD022649296B44200, 0x602646F6B0F2D404
-.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-Lk_dsbe:@ decryption sbox output *E*u, *E*t
-.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-
-@@
-@@  Decryption core
-@@
-@@  Same API as encryption core, except it clobbers q12-q15 rather than using
-@@  the values from _vpaes_preheat. q9-q11 must still be set from
-@@  _vpaes_preheat.
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_decrypt_core
-#endif
-.align	4
-_vpaes_decrypt_core:
-	mov	r9, r2
-	ldr	r8, [r2,#240]		@ pull rounds
-
-	@ This function performs shuffles with various constants. The x86_64
-	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
-	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
-	@ version preloads those constants into registers, but ARMv7 has half
-	@ the registers to work with. Instead, we load them on-demand into
-	@ q12-q15, registers normally use for preloaded constants. This is fine
-	@ because decryption doesn't use those constants. The values are
-	@ constant, so this does not interfere with potential 2x optimizations.
-	adr	r7, Lk_dipt
-
-	vld1.64	{q12,q13}, [r7]		@ vmovdqa	Lk_dipt(%rip), %xmm2	# iptlo
-	lsl	r11, r8, #4		@ mov		%rax,	%r11;	shl	$4, %r11
-	eor	r11, r11, #0x30		@ xor		$0x30,	%r11
-	adr	r10, Lk_sr
-	and	r11, r11, #0x30		@ and		$0x30,	%r11
-	add	r11, r11, r10
-	adr	r10, Lk_mc_forward+48
-
-	vld1.64	{q4}, [r9]!		@ vmovdqu	(%r9),	%xmm4		# round0 key
-	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1
-	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
-	vtbl.8	d4, {q12}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
-	vtbl.8	d5, {q12}, d3
-	vld1.64	{q5}, [r10]		@ vmovdqa	Lk_mc_forward+48(%rip), %xmm5
-					@ vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
-	vtbl.8	d0, {q13}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
-	vtbl.8	d1, {q13}, d1
-	veor	q2, q2, q4		@ vpxor		%xmm4,	%xmm2,	%xmm2
-	veor	q0, q0, q2		@ vpxor		%xmm2,	%xmm0,	%xmm0
-
-	@ .Ldec_entry ends with a bnz instruction which is normally paired with
-	@ subs in .Ldec_loop.
-	tst	r8, r8
-	b	Ldec_entry
-
-.align	4
-Ldec_loop:
-@
-@  Inverse mix columns
-@
-
-	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
-	@ the function.
-	adr	r10, Lk_dsb9
-	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
-					@ vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
-	@ Load sbd* ahead of time.
-	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
-					@ vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
-	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
-	vtbl.8	d9, {q12}, d5
-	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
-	vtbl.8	d3, {q13}, d7
-	veor	q0, q4, q0		@ vpxor		%xmm4,	%xmm0,	%xmm0
-
-	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
-
-	@ Load sbb* ahead of time.
-	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	0x20(%r10),%xmm4		# 4 : sbbu
-					@ vmovdqa	0x30(%r10),%xmm1		# 0 : sbbt
-
-	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
-	vtbl.8	d9, {q14}, d5
-	@ Write to q1 instead of q0, so the table and destination registers do
-	@ not overlap.
-	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	vtbl.8	d3, {q0}, d11
-	@ Here we restore the original q0/q1 usage. This instruction is
-	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
-	@ below.
-	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
-	vtbl.8	d3, {q15}, d7
-					@ vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
-	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
-					@ vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
-
-	@ Load sbd* ahead of time.
-	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x40(%r10),%xmm4		# 4 : sbeu
-					@ vmovdqa	0x50(%r10),%xmm1		# 0 : sbet
-
-	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
-	vtbl.8	d9, {q12}, d5
-	@ Write to q1 instead of q0, so the table and destination registers do
-	@ not overlap.
-	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	vtbl.8	d3, {q0}, d11
-	@ Here we restore the original q0/q1 usage. This instruction is
-	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
-	@ below.
-	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
-	vtbl.8	d3, {q13}, d7
-	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
-
-	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
-	vtbl.8	d9, {q14}, d5
-	@ Write to q1 instead of q0, so the table and destination registers do
-	@ not overlap.
-	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
-	vtbl.8	d3, {q0}, d11
-	@ Here we restore the original q0/q1 usage. This instruction is
-	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
-	@ below.
-	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
-	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
-	vtbl.8	d3, {q15}, d7
-	vext.8	q5, q5, q5, #12		@ vpalignr 	$12,	%xmm5,	%xmm5,	%xmm5
-	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
-	subs	r8, r8, #1		@ sub		$1,%rax			# nr--
-
-Ldec_entry:
-	@ top of round
-	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1	# 0 = k
-	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
-	vtbl.8	d4, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
-	vtbl.8	d5, {q11}, d3
-	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
-	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
-	vtbl.8	d7, {q10}, d1
-	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
-	vtbl.8	d9, {q10}, d3
-	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
-	veor	q4, q4, q2		@ vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
-	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
-	vtbl.8	d5, {q10}, d7
-	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
-	vtbl.8	d7, {q10}, d9
-	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
-	veor	q3, q3, q0		@ vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
-	vld1.64	{q0}, [r9]!		@ vmovdqu	(%r9),	%xmm0
-	bne	Ldec_loop
-
-	@ middle of last round
-
-	adr	r10, Lk_dsbo
-
-	@ Write to q1 rather than q4 to avoid overlapping table and destination.
-	vld1.64	{q1}, [r10]!		@ vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
-	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
-	vtbl.8	d9, {q1}, d5
-	@ Write to q2 rather than q1 to avoid overlapping table and destination.
-	vld1.64	{q2}, [r10]		@ vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
-	vtbl.8	d2, {q2}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
-	vtbl.8	d3, {q2}, d7
-	vld1.64	{q2}, [r11]		@ vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
-	veor	q4, q4, q0		@ vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
-	@ Write to q1 rather than q0 so the table and destination registers
-	@ below do not overlap.
-	veor	q1, q1, q4		@ vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
-	vtbl.8	d0, {q1}, d4	@ vpshufb	%xmm2,	%xmm0,	%xmm0
-	vtbl.8	d1, {q1}, d5
-	bx	lr
-
-
-.globl	_vpaes_decrypt
-.private_extern	_vpaes_decrypt
-#ifdef __thumb2__
-.thumb_func	_vpaes_decrypt
-#endif
-.align	4
-_vpaes_decrypt:
-	@ _vpaes_decrypt_core uses r7-r11.
-	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
-	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
-	vstmdb	sp!, {d8,d9,d10,d11}
-
-	vld1.64	{q0}, [r0]
-	bl	_vpaes_preheat
-	bl	_vpaes_decrypt_core
-	vst1.64	{q0}, [r1]
-
-	vldmia	sp!, {d8,d9,d10,d11}
-	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
-
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@@                                                    @@
-@@                  AES key schedule                  @@
-@@                                                    @@
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-@ This function diverges from both x86_64 and armv7 in which constants are
-@ pinned. x86_64 has a common preheat function for all operations. aarch64
-@ separates them because it has enough registers to pin nearly all constants.
-@ armv7 does not have enough registers, but needing explicit loads and stores
-@ also complicates using x86_64's register allocation directly.
-@
-@ We pin some constants for convenience and leave q14 and q15 free to load
-@ others on demand.
-
-@
-@  Key schedule constants
-@
-
-.align	4
-_vpaes_key_consts:
-Lk_dksd:@ decryption key schedule: invskew x*D
-.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-Lk_dksb:@ decryption key schedule: invskew x*B
-.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
-.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-Lk_dks9:@ decryption key schedule: invskew x*9
-.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-Lk_rcon:@ rcon
-.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-Lk_opt:@ output transform
-.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-Lk_deskew:@ deskew tables: inverts the sbox's "skew"
-.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-
-#ifdef __thumb2__
-.thumb_func	_vpaes_key_preheat
-#endif
-.align	4
-_vpaes_key_preheat:
-	adr	r11, Lk_rcon
-	vmov.i8	q12, #0x5b			@ Lk_s63
-	adr	r10, Lk_inv			@ Must be aligned to 8 mod 16.
-	vmov.i8	q9, #0x0f			@ Lk_s0F
-	vld1.64	{q10,q11}, [r10]		@ Lk_inv
-	vld1.64	{q8}, [r11]			@ Lk_rcon
-	bx	lr
-
-
-#ifdef __thumb2__
-.thumb_func	_vpaes_schedule_core
-#endif
-.align	4
-_vpaes_schedule_core:
-	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
-	@ so save an extra register.
-	stmdb	sp!, {r3,lr}
-
-	bl	_vpaes_key_preheat	@ load the tables
-
-	adr	r11, Lk_ipt		@ Must be aligned to 8 mod 16.
-	vld1.64	{q0}, [r0]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
-
-	@ input transform
-	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
-	@ overlap table and destination.
-	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
-	bl	_vpaes_schedule_transform
-	adr	r10, Lk_sr		@ Must be aligned to 8 mod 16.
-	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
-
-	add	r8, r8, r10
-	tst	r3, r3
-	bne	Lschedule_am_decrypting
-
-	@ encrypting, output zeroth round key after transform
-	vst1.64	{q0}, [r2]		@ vmovdqu	%xmm0,	(%rdx)
-	b	Lschedule_go
-
-Lschedule_am_decrypting:
-	@ decrypting, output zeroth round key after shiftrows
-	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
-	vtbl.8	d6, {q4}, d2	@ vpshufb  	%xmm1,	%xmm3,	%xmm3
-	vtbl.8	d7, {q4}, d3
-	vst1.64	{q3}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
-	eor	r8, r8, #0x30		@ xor	$0x30, %r8
-
-Lschedule_go:
-	cmp	r1, #192		@ cmp	$192,	%esi
-	bhi	Lschedule_256
-	beq	Lschedule_192
-	@ 128: fall though
-
-@@
-@@  .schedule_128
-@@
-@@  128-bit specific part of key schedule.
-@@
-@@  This schedule is really simple, because all its parts
-@@  are accomplished by the subroutines.
-@@
-Lschedule_128:
-	mov	r0, #10		@ mov	$10, %esi
-
-Loop_schedule_128:
-	bl	_vpaes_schedule_round
-	subs	r0, r0, #1		@ dec	%esi
-	beq	Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle	@ write output
-	b	Loop_schedule_128
-
-@@
-@@  .aes_schedule_192
-@@
-@@  192-bit specific part of key schedule.
-@@
-@@  The main body of this schedule is the same as the 128-bit
-@@  schedule, but with more smearing.  The long, high side is
-@@  stored in q7 as before, and the short, low side is in
-@@  the high bits of q6.
-@@
-@@  This schedule is somewhat nastier, however, because each
-@@  round produces 192 bits of key material, or 1.5 round keys.
-@@  Therefore, on each cycle we do 2 rounds and produce 3 round
-@@  keys.
-@@
-.align	4
-Lschedule_192:
-	sub	r0, r0, #8
-	vld1.64	{q0}, [r0]			@ vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
-	bl	_vpaes_schedule_transform	@ input transform
-	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save short part
-	vmov.i8	d12, #0			@ vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
-						@ vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
-	mov	r0, #4			@ mov	$4,	%esi
-
-Loop_schedule_192:
-	bl	_vpaes_schedule_round
-	vext.8	q0, q6, q0, #8			@ vpalignr	$8,%xmm6,%xmm0,%xmm0
-	bl	_vpaes_schedule_mangle		@ save key n
-	bl	_vpaes_schedule_192_smear
-	bl	_vpaes_schedule_mangle		@ save key n+1
-	bl	_vpaes_schedule_round
-	subs	r0, r0, #1			@ dec	%esi
-	beq	Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle		@ save key n+2
-	bl	_vpaes_schedule_192_smear
-	b	Loop_schedule_192
-
-@@
-@@  .aes_schedule_256
-@@
-@@  256-bit specific part of key schedule.
-@@
-@@  The structure here is very similar to the 128-bit
-@@  schedule, but with an additional "low side" in
-@@  q6.  The low side's rounds are the same as the
-@@  high side's, except no rcon and no rotation.
-@@
-.align	4
-Lschedule_256:
-	vld1.64	{q0}, [r0]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
-	bl	_vpaes_schedule_transform	@ input transform
-	mov	r0, #7			@ mov	$7, %esi
-
-Loop_schedule_256:
-	bl	_vpaes_schedule_mangle		@ output low result
-	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
-
-	@ high round
-	bl	_vpaes_schedule_round
-	subs	r0, r0, #1			@ dec	%esi
-	beq	Lschedule_mangle_last
-	bl	_vpaes_schedule_mangle
-
-	@ low round. swap xmm7 and xmm6
-	vdup.32	q0, d1[1]		@ vpshufd	$0xFF,	%xmm0,	%xmm0
-	vmov.i8	q4, #0
-	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
-	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
-	bl	_vpaes_schedule_low_round
-	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
-
-	b	Loop_schedule_256
-
-@@
-@@  .aes_schedule_mangle_last
-@@
-@@  Mangler for last round of key schedule
-@@  Mangles q0
-@@    when encrypting, outputs out(q0) ^ 63
-@@    when decrypting, outputs unskew(q0)
-@@
-@@  Always called right before return... jumps to cleanup and exits
-@@
-.align	4
-Lschedule_mangle_last:
-	@ schedule last round key from xmm0
-	adr	r11, Lk_deskew			@ lea	Lk_deskew(%rip),%r11	# prepare to deskew
-	tst	r3, r3
-	bne	Lschedule_mangle_last_dec
-
-	@ encrypting
-	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
-	adr	r11, Lk_opt		@ lea		Lk_opt(%rip),	%r11		# prepare to output transform
-	add	r2, r2, #32		@ add		$32,	%rdx
-	vmov	q2, q0
-	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
-	vtbl.8	d1, {q2}, d3
-
-Lschedule_mangle_last_dec:
-	sub	r2, r2, #16			@ add	$-16,	%rdx
-	veor	q0, q0, q12			@ vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
-	bl	_vpaes_schedule_transform	@ output transform
-	vst1.64	{q0}, [r2]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
-
-	@ cleanup
-	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
-	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
-	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
-	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
-	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
-	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
-	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
-	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
-	ldmia	sp!, {r3,pc}		@ return
-
-
-@@
-@@  .aes_schedule_192_smear
-@@
-@@  Smear the short, low side in the 192-bit key schedule.
-@@
-@@  Inputs:
-@@    q7: high side, b  a  x  y
-@@    q6:  low side, d  c  0  0
-@@
-@@  Outputs:
-@@    q6: b+c+d  b+c  0  0
-@@    q0: b+c+d  b+c  b  a
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_schedule_192_smear
-#endif
-.align	4
-_vpaes_schedule_192_smear:
-	vmov.i8	q1, #0
-	vdup.32	q0, d15[1]
-	vshl.i64	q1, q6, #32		@ vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
-	vmov	d0, d15		@ vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
-	veor	q6, q6, q1		@ vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
-	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
-	veor	q6, q6, q0		@ vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
-	vmov	q0, q6			@ vmovdqa	%xmm6,	%xmm0
-	vmov	d12, d2		@ vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
-	bx	lr
-
-
-@@
-@@  .aes_schedule_round
-@@
-@@  Runs one main round of the key schedule on q0, q7
-@@
-@@  Specifically, runs subbytes on the high dword of q0
-@@  then rotates it by one byte and xors into the low dword of
-@@  q7.
-@@
-@@  Adds rcon from low byte of q8, then rotates q8 for
-@@  next rcon.
-@@
-@@  Smears the dwords of q7 by xoring the low into the
-@@  second low, result into third, result into highest.
-@@
-@@  Returns results in q7 = q0.
-@@  Clobbers q1-q4, r11.
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_schedule_round
-#endif
-.align	4
-_vpaes_schedule_round:
-	@ extract rcon from xmm8
-	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
-	vext.8	q1, q8, q4, #15		@ vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
-	vext.8	q8, q8, q8, #15	@ vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
-	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
-
-	@ rotate
-	vdup.32	q0, d1[1]			@ vpshufd	$0xFF,	%xmm0,	%xmm0
-	vext.8	q0, q0, q0, #1			@ vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
-
-	@ fall through...
-
-	@ low round: same as high round, but no rotation and no rcon.
-_vpaes_schedule_low_round:
-	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
-	@ We pin other values in _vpaes_key_preheat, so load them now.
-	adr	r11, Lk_sb1
-	vld1.64	{q14,q15}, [r11]
-
-	@ smear xmm7
-	vext.8	q1, q4, q7, #12			@ vpslldq	$4,	%xmm7,	%xmm1
-	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
-	vext.8	q4, q4, q7, #8			@ vpslldq	$8,	%xmm7,	%xmm4
-
-	@ subbytes
-	vand	q1, q0, q9			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
-	vshr.u8	q0, q0, #4			@ vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
-	veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
-	vtbl.8	d4, {q11}, d2		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
-	vtbl.8	d5, {q11}, d3
-	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
-	vtbl.8	d6, {q10}, d0		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
-	vtbl.8	d7, {q10}, d1
-	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
-	vtbl.8	d8, {q10}, d2		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
-	vtbl.8	d9, {q10}, d3
-	veor	q7, q7, q12			@ vpxor		Lk_s63(%rip),	%xmm7,	%xmm7
-	vtbl.8	d6, {q10}, d6		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
-	vtbl.8	d7, {q10}, d7
-	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
-	vtbl.8	d4, {q10}, d8		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
-	vtbl.8	d5, {q10}, d9
-	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
-	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
-	vtbl.8	d8, {q15}, d6		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
-	vtbl.8	d9, {q15}, d7
-	vtbl.8	d2, {q14}, d4		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
-	vtbl.8	d3, {q14}, d5
-	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
-
-	@ add in smeared stuff
-	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
-	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
-	bx	lr
-
-
-@@
-@@  .aes_schedule_transform
-@@
-@@  Linear-transform q0 according to tables at [r11]
-@@
-@@  Requires that q9 = 0x0F0F... as in preheat
-@@  Output in q0
-@@  Clobbers q1, q2, q14, q15
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_schedule_transform
-#endif
-.align	4
-_vpaes_schedule_transform:
-	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
-					@ vmovdqa	16(%r11),	%xmm1 # hi
-	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
-	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
-	vtbl.8	d4, {q14}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
-	vtbl.8	d5, {q14}, d3
-	vtbl.8	d0, {q15}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
-	vtbl.8	d1, {q15}, d1
-	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
-	bx	lr
-
-
-@@
-@@  .aes_schedule_mangle
-@@
-@@  Mangles q0 from (basis-transformed) standard version
-@@  to our version.
-@@
-@@  On encrypt,
-@@    xor with 0x63
-@@    multiply by circulant 0,1,1,1
-@@    apply shiftrows transform
-@@
-@@  On decrypt,
-@@    xor with 0x63
-@@    multiply by "inverse mixcolumns" circulant E,B,D,9
-@@    deskew
-@@    apply shiftrows transform
-@@
-@@
-@@  Writes out to [r2], and increments or decrements it
-@@  Keeps track of round number mod 4 in r8
-@@  Preserves q0
-@@  Clobbers q1-q5
-@@
-#ifdef __thumb2__
-.thumb_func	_vpaes_schedule_mangle
-#endif
-.align	4
-_vpaes_schedule_mangle:
-	tst	r3, r3
-	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
-	adr	r11, Lk_mc_forward	@ Must be aligned to 8 mod 16.
-	vld1.64	{q5}, [r11]		@ vmovdqa	Lk_mc_forward(%rip),%xmm5
-	bne	Lschedule_mangle_dec
-
-	@ encrypting
-	@ Write to q2 so we do not overlap table and destination below.
-	veor	q2, q0, q12		@ vpxor		Lk_s63(%rip),	%xmm0,	%xmm4
-	add	r2, r2, #16		@ add		$16,	%rdx
-	vtbl.8	d8, {q2}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm4
-	vtbl.8	d9, {q2}, d11
-	vtbl.8	d2, {q4}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm1
-	vtbl.8	d3, {q4}, d11
-	vtbl.8	d6, {q1}, d10	@ vpshufb	%xmm5,	%xmm1,	%xmm3
-	vtbl.8	d7, {q1}, d11
-	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
-	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
-	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
-
-	b	Lschedule_mangle_both
-.align	4
-Lschedule_mangle_dec:
-	@ inverse mix columns
-	adr	r11, Lk_dksd 		@ lea		Lk_dksd(%rip),%r11
-	vshr.u8	q1, q4, #4		@ vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
-	vand	q4, q4, q9		@ vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
-
-	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x00(%r11),	%xmm2
-					@ vmovdqa	0x10(%r11),	%xmm3
-	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
-	vtbl.8	d5, {q14}, d9
-	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
-	vtbl.8	d7, {q15}, d3
-	@ Load .Lk_dksb ahead of time.
-	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x20(%r11),	%xmm2
-					@ vmovdqa	0x30(%r11),	%xmm3
-	@ Write to q13 so we do not overlap table and destination.
-	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
-	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
-	vtbl.8	d7, {q13}, d11
-
-	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
-	vtbl.8	d5, {q14}, d9
-	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
-	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
-	vtbl.8	d7, {q15}, d3
-	@ Load .Lk_dkse ahead of time.
-	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x40(%r11),	%xmm2
-					@ vmovdqa	0x50(%r11),	%xmm3
-	@ Write to q13 so we do not overlap table and destination.
-	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
-	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
-	vtbl.8	d7, {q13}, d11
-
-	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
-	vtbl.8	d5, {q14}, d9
-	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
-	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
-	vtbl.8	d7, {q15}, d3
-	@ Load .Lk_dkse ahead of time.
-	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x60(%r11),	%xmm2
-					@ vmovdqa	0x70(%r11),	%xmm4
-	@ Write to q13 so we do not overlap table and destination.
-	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
-
-	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
-	vtbl.8	d5, {q14}, d9
-	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
-	vtbl.8	d7, {q13}, d11
-	vtbl.8	d8, {q15}, d2	@ vpshufb	%xmm1,	%xmm4,	%xmm4
-	vtbl.8	d9, {q15}, d3
-	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
-	veor	q2, q2, q3		@ vpxor	%xmm3,	%xmm2,	%xmm2
-	veor	q3, q4, q2		@ vpxor	%xmm2,	%xmm4,	%xmm3
-
-	sub	r2, r2, #16		@ add	$-16,	%rdx
-
-Lschedule_mangle_both:
-	@ Write to q2 so table and destination do not overlap.
-	vtbl.8	d4, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
-	vtbl.8	d5, {q3}, d3
-	add	r8, r8, #64-16		@ add	$-16,	%r8
-	and	r8, r8, #~(1<<6)	@ and	$0x30,	%r8
-	vst1.64	{q2}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
-	bx	lr
-
-
-.globl	_vpaes_set_encrypt_key
-.private_extern	_vpaes_set_encrypt_key
-#ifdef __thumb2__
-.thumb_func	_vpaes_set_encrypt_key
-#endif
-.align	4
-_vpaes_set_encrypt_key:
-	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
-	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
-	lsr	r9, r1, #5		@ shr	$5,%eax
-	add	r9, r9, #5		@ $5,%eax
-	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-
-	mov	r3, #0		@ mov	$0,%ecx
-	mov	r8, #0x30		@ mov	$0x30,%r8d
-	bl	_vpaes_schedule_core
-	eor	r0, r0, r0
-
-	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
-
-
-.globl	_vpaes_set_decrypt_key
-.private_extern	_vpaes_set_decrypt_key
-#ifdef __thumb2__
-.thumb_func	_vpaes_set_decrypt_key
-#endif
-.align	4
-_vpaes_set_decrypt_key:
-	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
-	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
-	lsr	r9, r1, #5		@ shr	$5,%eax
-	add	r9, r9, #5		@ $5,%eax
-	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
-	lsl	r9, r9, #4		@ shl	$4,%eax
-	add	r2, r2, #16		@ lea	16(%rdx,%rax),%rdx
-	add	r2, r2, r9
-
-	mov	r3, #1		@ mov	$1,%ecx
-	lsr	r8, r1, #1		@ shr	$1,%r8d
-	and	r8, r8, #32		@ and	$32,%r8d
-	eor	r8, r8, #32		@ xor	$32,%r8d	# nbits==192?0:32
-	bl	_vpaes_schedule_core
-
-	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
-
-
-@ Additional constants for converting to bsaes.
-
-.align	4
-_vpaes_convert_consts:
-@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
-@ transform in the AES S-box. 0x63 is incorporated into the low half of the
-@ table. This was computed with the following script:
-@
-@   def u64s_to_u128(x, y):
-@       return x | (y << 64)
-@   def u128_to_u64s(w):
-@       return w & ((1<<64)-1), w >> 64
-@   def get_byte(w, i):
-@       return (w >> (i*8)) & 0xff
-@   def apply_table(table, b):
-@       lo = b & 0xf
-@       hi = b >> 4
-@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
-@   def opt(b):
-@       table = [
-@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
-@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
-@       ]
-@       return apply_table(table, b)
-@   def rot_byte(b, n):
-@       return 0xff & ((b << n) | (b >> (8-n)))
-@   def skew(x):
-@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
-@               rot_byte(x, 4))
-@   table = [0, 0]
-@   for i in range(16):
-@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
-@       table[1] |= skew(opt(i<<4)) << (i*8)
-@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[0]))
-@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[1]))
-Lk_opt_then_skew:
-.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
-.quad	0x1f30062936192f00, 0xb49bad829db284ab
-
-@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
-@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
-@ becomes 0x22334411 and then 0x11443322.
-Lk_decrypt_transform:
-.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
-
-
-@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
-.globl	_vpaes_encrypt_key_to_bsaes
-.private_extern	_vpaes_encrypt_key_to_bsaes
-#ifdef __thumb2__
-.thumb_func	_vpaes_encrypt_key_to_bsaes
-#endif
-.align	4
-_vpaes_encrypt_key_to_bsaes:
-	stmdb	sp!, {r11, lr}
-
-	@ See _vpaes_schedule_core for the key schedule logic. In particular,
-	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
-	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
-	@ contain the transformations not in the bsaes representation. This
-	@ function inverts those transforms.
-	@
-	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
-	@ representation, which does not match the other aes_nohw_*
-	@ implementations. The ARM aes_nohw_* stores each 32-bit word
-	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
-	@ cost of extra REV and VREV32 operations in little-endian ARM.
-
-	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
-	adr	r2, Lk_mc_forward	@ Must be aligned to 8 mod 16.
-	add	r3, r2, 0x90		@ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
-
-	vld1.64	{q12}, [r2]
-	vmov.i8	q10, #0x5b		@ Lk_s63 from vpaes-x86_64
-	adr	r11, Lk_opt		@ Must be aligned to 8 mod 16.
-	vmov.i8	q11, #0x63		@ LK_s63 without Lk_ipt applied
-
-	@ vpaes stores one fewer round count than bsaes, but the number of keys
-	@ is the same.
-	ldr	r2, [r1,#240]
-	add	r2, r2, #1
-	str	r2, [r0,#240]
-
-	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
-	@ Invert this with .Lk_opt.
-	vld1.64	{q0}, [r1]!
-	bl	_vpaes_schedule_transform
-	vrev32.8	q0, q0
-	vst1.64	{q0}, [r0]!
-
-	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
-	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
-	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
-Loop_enc_key_to_bsaes:
-	vld1.64	{q0}, [r1]!
-
-	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
-	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
-	@ We use r3 rather than r8 to avoid a callee-saved register.
-	vld1.64	{q1}, [r3]
-	vtbl.8	d4, {q0}, d2
-	vtbl.8	d5, {q0}, d3
-	add	r3, r3, #16
-	and	r3, r3, #~(1<<6)
-	vmov	q0, q2
-
-	@ Handle the last key differently.
-	subs	r2, r2, #1
-	beq	Loop_enc_key_to_bsaes_last
-
-	@ Multiply by the circulant. This is its own inverse.
-	vtbl.8	d2, {q0}, d24
-	vtbl.8	d3, {q0}, d25
-	vmov	q0, q1
-	vtbl.8	d4, {q1}, d24
-	vtbl.8	d5, {q1}, d25
-	veor	q0, q0, q2
-	vtbl.8	d2, {q2}, d24
-	vtbl.8	d3, {q2}, d25
-	veor	q0, q0, q1
-
-	@ XOR and finish.
-	veor	q0, q0, q10
-	bl	_vpaes_schedule_transform
-	vrev32.8	q0, q0
-	vst1.64	{q0}, [r0]!
-	b	Loop_enc_key_to_bsaes
-
-Loop_enc_key_to_bsaes_last:
-	@ The final key does not have a basis transform (note
-	@ .Lschedule_mangle_last inverts the original transform). It only XORs
-	@ 0x63 and applies ShiftRows. The latter was already inverted in the
-	@ loop. Note that, because we act on the original representation, we use
-	@ q11, not q10.
-	veor	q0, q0, q11
-	vrev32.8	q0, q0
-	vst1.64	{q0}, [r0]
-
-	@ Wipe registers which contained key material.
-	veor	q0, q0, q0
-	veor	q1, q1, q1
-	veor	q2, q2, q2
-
-	ldmia	sp!, {r11, pc}	@ return
-
-
-@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
-.globl	_vpaes_decrypt_key_to_bsaes
-.private_extern	_vpaes_decrypt_key_to_bsaes
-#ifdef __thumb2__
-.thumb_func	_vpaes_decrypt_key_to_bsaes
-#endif
-.align	4
-_vpaes_decrypt_key_to_bsaes:
-	stmdb	sp!, {r11, lr}
-
-	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
-	@ computes the decryption key schedule in reverse. Additionally,
-	@ aes-x86_64.pl shares some transformations, so we must only partially
-	@ invert vpaes's transformations. In general, vpaes computes in a
-	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
-	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
-	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
-	@
-	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
-	@ representation, which does not match the other aes_nohw_*
-	@ implementations. The ARM aes_nohw_* stores each 32-bit word
-	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
-	@ cost of extra REV and VREV32 operations in little-endian ARM.
-
-	adr	r2, Lk_decrypt_transform
-	adr	r3, Lk_sr+0x30
-	adr	r11, Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
-	vld1.64	{q12}, [r2]	@ Reuse q12 from encryption.
-	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
-
-	@ vpaes stores one fewer round count than bsaes, but the number of keys
-	@ is the same.
-	ldr	r2, [r1,#240]
-	add	r2, r2, #1
-	str	r2, [r0,#240]
-
-	@ Undo the basis change and reapply the S-box affine transform. See
-	@ .Lschedule_mangle_last.
-	vld1.64	{q0}, [r1]!
-	bl	_vpaes_schedule_transform
-	vrev32.8	q0, q0
-	vst1.64	{q0}, [r0]!
-
-	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
-	@ it simultaneously inverts MixColumns and the S-box affine transform.
-	@ See .Lk_dksd through .Lk_dks9.
-Loop_dec_key_to_bsaes:
-	vld1.64	{q0}, [r1]!
-
-	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going
-	@ forwards cancels inverting for which direction we cycle r3. We use r3
-	@ rather than r8 to avoid a callee-saved register.
-	vld1.64	{q1}, [r3]
-	vtbl.8	d4, {q0}, d2
-	vtbl.8	d5, {q0}, d3
-	add	r3, r3, #64-16
-	and	r3, r3, #~(1<<6)
-	vmov	q0, q2
-
-	@ Handle the last key differently.
-	subs	r2, r2, #1
-	beq	Loop_dec_key_to_bsaes_last
-
-	@ Undo the basis change and reapply the S-box affine transform.
-	bl	_vpaes_schedule_transform
-
-	@ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
-	@ combine the two operations in .Lk_decrypt_transform.
-	@
-	@ TODO(davidben): Where does the rotation come from?
-	vtbl.8	d2, {q0}, d24
-	vtbl.8	d3, {q0}, d25
-
-	vst1.64	{q1}, [r0]!
-	b	Loop_dec_key_to_bsaes
-
-Loop_dec_key_to_bsaes_last:
-	@ The final key only inverts ShiftRows (already done in the loop). See
-	@ .Lschedule_am_decrypting. Its basis is not transformed.
-	vrev32.8	q0, q0
-	vst1.64	{q0}, [r0]!
-
-	@ Wipe registers which contained key material.
-	veor	q0, q0, q0
-	veor	q1, q1, q1
-	veor	q2, q2, q2
-
-	ldmia	sp!, {r11, pc}	@ return
-
-.globl	_vpaes_ctr32_encrypt_blocks
-.private_extern	_vpaes_ctr32_encrypt_blocks
-#ifdef __thumb2__
-.thumb_func	_vpaes_ctr32_encrypt_blocks
-#endif
-.align	4
-_vpaes_ctr32_encrypt_blocks:
-	mov	ip, sp
-	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
-	@ This function uses q4-q7 (d8-d15), which are callee-saved.
-	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
-	cmp	r2, #0
-	@ r8 is passed on the stack.
-	ldr	r8, [ip]
-	beq	Lctr32_done
-
-	@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
-	mov	r9, r3
-	mov	r3, r2
-	mov	r2, r9
-
-	@ Load the IV and counter portion.
-	ldr	r7, [r8, #12]
-	vld1.8	{q7}, [r8]
-
-	bl	_vpaes_preheat
-	rev	r7, r7		@ The counter is big-endian.
-
-Lctr32_loop:
-	vmov	q0, q7
-	vld1.8	{q6}, [r0]!		@ Load input ahead of time
-	bl	_vpaes_encrypt_core
-	veor	q0, q0, q6		@ XOR input and result
-	vst1.8	{q0}, [r1]!
-	subs	r3, r3, #1
-	@ Update the counter.
-	add	r7, r7, #1
-	rev	r9, r7
-	vmov.32	d15[1], r9
-	bne	Lctr32_loop
-
-Lctr32_done:
-	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-arm/crypto/test/trampoline-armv4-apple.S b/apple-arm/crypto/test/trampoline-armv4-apple.S
deleted file mode 100644
index 425a085..0000000
--- a/apple-arm/crypto/test/trampoline-armv4-apple.S
+++ /dev/null
@@ -1,368 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__APPLE__)
-.syntax	unified
-
-
-
-
-.text
-
-@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
-@ with |argv|, then saves the callee-saved registers into |state|. It returns
-@ the result of |func|. The |unwind| argument is unused.
-@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
-@                              const uint32_t *argv, size_t argc,
-@                              int unwind);
-
-.globl	_abi_test_trampoline
-.private_extern	_abi_test_trampoline
-.align	4
-_abi_test_trampoline:
-	@ Save parameters and all callee-saved registers. For convenience, we
-	@ save r9 on iOS even though it's volatile.
-	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-	stmdb	sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
-
-	@ Reserve stack space for six (10-4) stack parameters, plus an extra 4
-	@ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
-	sub	sp, sp, #28
-
-	@ Every register in AAPCS is either non-volatile or a parameter (except
-	@ r9 on iOS), so this code, by the actual call, loses all its scratch
-	@ registers. First fill in stack parameters while there are registers
-	@ to spare.
-	cmp	r3, #4
-	bls	Lstack_args_done
-	mov	r4, sp				@ r4 is the output pointer.
-	add	r5, r2, r3, lsl #2	@ Set r5 to the end of argv.
-	add	r2, r2, #16		@ Skip four arguments.
-Lstack_args_loop:
-	ldr	r6, [r2], #4
-	cmp	r2, r5
-	str	r6, [r4], #4
-	bne	Lstack_args_loop
-
-Lstack_args_done:
-	@ Load registers from |r1|.
-	vldmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
-#if defined(__APPLE__)
-	@ r9 is not volatile on iOS.
-	ldmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
-#else
-	ldmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
-#endif
-
-	@ Load register parameters. This uses up our remaining registers, so we
-	@ repurpose lr as scratch space.
-	ldr	r3, [sp, #40]	@ Reload argc.
-	ldr	lr, [sp, #36]		@ Load argv into lr.
-	cmp	r3, #3
-	bhi	Larg_r3
-	beq	Larg_r2
-	cmp	r3, #1
-	bhi	Larg_r1
-	beq	Larg_r0
-	b	Largs_done
-
-Larg_r3:
-	ldr	r3, [lr, #12]	@ argv[3]
-Larg_r2:
-	ldr	r2, [lr, #8]	@ argv[2]
-Larg_r1:
-	ldr	r1, [lr, #4]	@ argv[1]
-Larg_r0:
-	ldr	r0, [lr]	@ argv[0]
-Largs_done:
-
-	@ With every other register in use, load the function pointer into lr
-	@ and call the function.
-	ldr	lr, [sp, #28]
-	blx	lr
-
-	@ r1-r3 are free for use again. The trampoline only supports
-	@ single-return functions. Pass r4-r11 to the caller.
-	ldr	r1, [sp, #32]
-	vstmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
-#if defined(__APPLE__)
-	@ r9 is not volatile on iOS.
-	stmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
-#else
-	stmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
-#endif
-
-	@ Unwind the stack and restore registers.
-	add	sp, sp, #44		@ 44 = 28+16
-	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ Skip r0-r3 (see +16 above).
-	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
-
-	bx	lr
-
-
-.globl	_abi_test_clobber_r0
-.private_extern	_abi_test_clobber_r0
-.align	4
-_abi_test_clobber_r0:
-	mov	r0, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r1
-.private_extern	_abi_test_clobber_r1
-.align	4
-_abi_test_clobber_r1:
-	mov	r1, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r2
-.private_extern	_abi_test_clobber_r2
-.align	4
-_abi_test_clobber_r2:
-	mov	r2, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r3
-.private_extern	_abi_test_clobber_r3
-.align	4
-_abi_test_clobber_r3:
-	mov	r3, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r4
-.private_extern	_abi_test_clobber_r4
-.align	4
-_abi_test_clobber_r4:
-	mov	r4, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r5
-.private_extern	_abi_test_clobber_r5
-.align	4
-_abi_test_clobber_r5:
-	mov	r5, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r6
-.private_extern	_abi_test_clobber_r6
-.align	4
-_abi_test_clobber_r6:
-	mov	r6, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r7
-.private_extern	_abi_test_clobber_r7
-.align	4
-_abi_test_clobber_r7:
-	mov	r7, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r8
-.private_extern	_abi_test_clobber_r8
-.align	4
-_abi_test_clobber_r8:
-	mov	r8, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r9
-.private_extern	_abi_test_clobber_r9
-.align	4
-_abi_test_clobber_r9:
-	mov	r9, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r10
-.private_extern	_abi_test_clobber_r10
-.align	4
-_abi_test_clobber_r10:
-	mov	r10, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r11
-.private_extern	_abi_test_clobber_r11
-.align	4
-_abi_test_clobber_r11:
-	mov	r11, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_r12
-.private_extern	_abi_test_clobber_r12
-.align	4
-_abi_test_clobber_r12:
-	mov	r12, #0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d0
-.private_extern	_abi_test_clobber_d0
-.align	4
-_abi_test_clobber_d0:
-	mov	r0, #0
-	vmov	s0, r0
-	vmov	s1, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d1
-.private_extern	_abi_test_clobber_d1
-.align	4
-_abi_test_clobber_d1:
-	mov	r0, #0
-	vmov	s2, r0
-	vmov	s3, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d2
-.private_extern	_abi_test_clobber_d2
-.align	4
-_abi_test_clobber_d2:
-	mov	r0, #0
-	vmov	s4, r0
-	vmov	s5, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d3
-.private_extern	_abi_test_clobber_d3
-.align	4
-_abi_test_clobber_d3:
-	mov	r0, #0
-	vmov	s6, r0
-	vmov	s7, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d4
-.private_extern	_abi_test_clobber_d4
-.align	4
-_abi_test_clobber_d4:
-	mov	r0, #0
-	vmov	s8, r0
-	vmov	s9, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d5
-.private_extern	_abi_test_clobber_d5
-.align	4
-_abi_test_clobber_d5:
-	mov	r0, #0
-	vmov	s10, r0
-	vmov	s11, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d6
-.private_extern	_abi_test_clobber_d6
-.align	4
-_abi_test_clobber_d6:
-	mov	r0, #0
-	vmov	s12, r0
-	vmov	s13, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d7
-.private_extern	_abi_test_clobber_d7
-.align	4
-_abi_test_clobber_d7:
-	mov	r0, #0
-	vmov	s14, r0
-	vmov	s15, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d8
-.private_extern	_abi_test_clobber_d8
-.align	4
-_abi_test_clobber_d8:
-	mov	r0, #0
-	vmov	s16, r0
-	vmov	s17, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d9
-.private_extern	_abi_test_clobber_d9
-.align	4
-_abi_test_clobber_d9:
-	mov	r0, #0
-	vmov	s18, r0
-	vmov	s19, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d10
-.private_extern	_abi_test_clobber_d10
-.align	4
-_abi_test_clobber_d10:
-	mov	r0, #0
-	vmov	s20, r0
-	vmov	s21, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d11
-.private_extern	_abi_test_clobber_d11
-.align	4
-_abi_test_clobber_d11:
-	mov	r0, #0
-	vmov	s22, r0
-	vmov	s23, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d12
-.private_extern	_abi_test_clobber_d12
-.align	4
-_abi_test_clobber_d12:
-	mov	r0, #0
-	vmov	s24, r0
-	vmov	s25, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d13
-.private_extern	_abi_test_clobber_d13
-.align	4
-_abi_test_clobber_d13:
-	mov	r0, #0
-	vmov	s26, r0
-	vmov	s27, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d14
-.private_extern	_abi_test_clobber_d14
-.align	4
-_abi_test_clobber_d14:
-	mov	r0, #0
-	vmov	s28, r0
-	vmov	s29, r0
-	bx	lr
-
-
-.globl	_abi_test_clobber_d15
-.private_extern	_abi_test_clobber_d15
-.align	4
-_abi_test_clobber_d15:
-	mov	r0, #0
-	vmov	s30, r0
-	vmov	s31, r0
-	bx	lr
-
-#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__APPLE__)
diff --git a/apple-x86/crypto/chacha/chacha-x86-apple.S b/apple-x86/crypto/chacha/chacha-x86-apple.S
deleted file mode 100644
index baa06ac..0000000
--- a/apple-x86/crypto/chacha/chacha-x86-apple.S
+++ /dev/null
@@ -1,973 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_ChaCha20_ctr32
-.private_extern	_ChaCha20_ctr32
-.align	4
-_ChaCha20_ctr32:
-L_ChaCha20_ctr32_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	xorl	%eax,%eax
-	cmpl	28(%esp),%eax
-	je	L000no_data
-	call	Lpic_point
-Lpic_point:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp
-	testl	$16777216,(%ebp)
-	jz	L001x86
-	testl	$512,4(%ebp)
-	jz	L001x86
-	jmp	Lssse3_shortcut
-L001x86:
-	movl	32(%esp),%esi
-	movl	36(%esp),%edi
-	subl	$132,%esp
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
-	movl	%eax,80(%esp)
-	movl	%ebx,84(%esp)
-	movl	%ecx,88(%esp)
-	movl	%edx,92(%esp)
-	movl	16(%esi),%eax
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edx
-	movl	%eax,96(%esp)
-	movl	%ebx,100(%esp)
-	movl	%ecx,104(%esp)
-	movl	%edx,108(%esp)
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	subl	$1,%eax
-	movl	%eax,112(%esp)
-	movl	%ebx,116(%esp)
-	movl	%ecx,120(%esp)
-	movl	%edx,124(%esp)
-	jmp	L002entry
-.align	4,0x90
-L003outer_loop:
-	movl	%ebx,156(%esp)
-	movl	%eax,152(%esp)
-	movl	%ecx,160(%esp)
-L002entry:
-	movl	$1634760805,%eax
-	movl	$857760878,4(%esp)
-	movl	$2036477234,8(%esp)
-	movl	$1797285236,12(%esp)
-	movl	84(%esp),%ebx
-	movl	88(%esp),%ebp
-	movl	104(%esp),%ecx
-	movl	108(%esp),%esi
-	movl	116(%esp),%edx
-	movl	120(%esp),%edi
-	movl	%ebx,20(%esp)
-	movl	%ebp,24(%esp)
-	movl	%ecx,40(%esp)
-	movl	%esi,44(%esp)
-	movl	%edx,52(%esp)
-	movl	%edi,56(%esp)
-	movl	92(%esp),%ebx
-	movl	124(%esp),%edi
-	movl	112(%esp),%edx
-	movl	80(%esp),%ebp
-	movl	96(%esp),%ecx
-	movl	100(%esp),%esi
-	addl	$1,%edx
-	movl	%ebx,28(%esp)
-	movl	%edi,60(%esp)
-	movl	%edx,112(%esp)
-	movl	$10,%ebx
-	jmp	L004loop
-.align	4,0x90
-L004loop:
-	addl	%ebp,%eax
-	movl	%ebx,128(%esp)
-	movl	%ebp,%ebx
-	xorl	%eax,%edx
-	roll	$16,%edx
-	addl	%edx,%ecx
-	xorl	%ecx,%ebx
-	movl	52(%esp),%edi
-	roll	$12,%ebx
-	movl	20(%esp),%ebp
-	addl	%ebx,%eax
-	xorl	%eax,%edx
-	movl	%eax,(%esp)
-	roll	$8,%edx
-	movl	4(%esp),%eax
-	addl	%edx,%ecx
-	movl	%edx,48(%esp)
-	xorl	%ecx,%ebx
-	addl	%ebp,%eax
-	roll	$7,%ebx
-	xorl	%eax,%edi
-	movl	%ecx,32(%esp)
-	roll	$16,%edi
-	movl	%ebx,16(%esp)
-	addl	%edi,%esi
-	movl	40(%esp),%ecx
-	xorl	%esi,%ebp
-	movl	56(%esp),%edx
-	roll	$12,%ebp
-	movl	24(%esp),%ebx
-	addl	%ebp,%eax
-	xorl	%eax,%edi
-	movl	%eax,4(%esp)
-	roll	$8,%edi
-	movl	8(%esp),%eax
-	addl	%edi,%esi
-	movl	%edi,52(%esp)
-	xorl	%esi,%ebp
-	addl	%ebx,%eax
-	roll	$7,%ebp
-	xorl	%eax,%edx
-	movl	%esi,36(%esp)
-	roll	$16,%edx
-	movl	%ebp,20(%esp)
-	addl	%edx,%ecx
-	movl	44(%esp),%esi
-	xorl	%ecx,%ebx
-	movl	60(%esp),%edi
-	roll	$12,%ebx
-	movl	28(%esp),%ebp
-	addl	%ebx,%eax
-	xorl	%eax,%edx
-	movl	%eax,8(%esp)
-	roll	$8,%edx
-	movl	12(%esp),%eax
-	addl	%edx,%ecx
-	movl	%edx,56(%esp)
-	xorl	%ecx,%ebx
-	addl	%ebp,%eax
-	roll	$7,%ebx
-	xorl	%eax,%edi
-	roll	$16,%edi
-	movl	%ebx,24(%esp)
-	addl	%edi,%esi
-	xorl	%esi,%ebp
-	roll	$12,%ebp
-	movl	20(%esp),%ebx
-	addl	%ebp,%eax
-	xorl	%eax,%edi
-	movl	%eax,12(%esp)
-	roll	$8,%edi
-	movl	(%esp),%eax
-	addl	%edi,%esi
-	movl	%edi,%edx
-	xorl	%esi,%ebp
-	addl	%ebx,%eax
-	roll	$7,%ebp
-	xorl	%eax,%edx
-	roll	$16,%edx
-	movl	%ebp,28(%esp)
-	addl	%edx,%ecx
-	xorl	%ecx,%ebx
-	movl	48(%esp),%edi
-	roll	$12,%ebx
-	movl	24(%esp),%ebp
-	addl	%ebx,%eax
-	xorl	%eax,%edx
-	movl	%eax,(%esp)
-	roll	$8,%edx
-	movl	4(%esp),%eax
-	addl	%edx,%ecx
-	movl	%edx,60(%esp)
-	xorl	%ecx,%ebx
-	addl	%ebp,%eax
-	roll	$7,%ebx
-	xorl	%eax,%edi
-	movl	%ecx,40(%esp)
-	roll	$16,%edi
-	movl	%ebx,20(%esp)
-	addl	%edi,%esi
-	movl	32(%esp),%ecx
-	xorl	%esi,%ebp
-	movl	52(%esp),%edx
-	roll	$12,%ebp
-	movl	28(%esp),%ebx
-	addl	%ebp,%eax
-	xorl	%eax,%edi
-	movl	%eax,4(%esp)
-	roll	$8,%edi
-	movl	8(%esp),%eax
-	addl	%edi,%esi
-	movl	%edi,48(%esp)
-	xorl	%esi,%ebp
-	addl	%ebx,%eax
-	roll	$7,%ebp
-	xorl	%eax,%edx
-	movl	%esi,44(%esp)
-	roll	$16,%edx
-	movl	%ebp,24(%esp)
-	addl	%edx,%ecx
-	movl	36(%esp),%esi
-	xorl	%ecx,%ebx
-	movl	56(%esp),%edi
-	roll	$12,%ebx
-	movl	16(%esp),%ebp
-	addl	%ebx,%eax
-	xorl	%eax,%edx
-	movl	%eax,8(%esp)
-	roll	$8,%edx
-	movl	12(%esp),%eax
-	addl	%edx,%ecx
-	movl	%edx,52(%esp)
-	xorl	%ecx,%ebx
-	addl	%ebp,%eax
-	roll	$7,%ebx
-	xorl	%eax,%edi
-	roll	$16,%edi
-	movl	%ebx,28(%esp)
-	addl	%edi,%esi
-	xorl	%esi,%ebp
-	movl	48(%esp),%edx
-	roll	$12,%ebp
-	movl	128(%esp),%ebx
-	addl	%ebp,%eax
-	xorl	%eax,%edi
-	movl	%eax,12(%esp)
-	roll	$8,%edi
-	movl	(%esp),%eax
-	addl	%edi,%esi
-	movl	%edi,56(%esp)
-	xorl	%esi,%ebp
-	roll	$7,%ebp
-	decl	%ebx
-	jnz	L004loop
-	movl	160(%esp),%ebx
-	addl	$1634760805,%eax
-	addl	80(%esp),%ebp
-	addl	96(%esp),%ecx
-	addl	100(%esp),%esi
-	cmpl	$64,%ebx
-	jb	L005tail
-	movl	156(%esp),%ebx
-	addl	112(%esp),%edx
-	addl	120(%esp),%edi
-	xorl	(%ebx),%eax
-	xorl	16(%ebx),%ebp
-	movl	%eax,(%esp)
-	movl	152(%esp),%eax
-	xorl	32(%ebx),%ecx
-	xorl	36(%ebx),%esi
-	xorl	48(%ebx),%edx
-	xorl	56(%ebx),%edi
-	movl	%ebp,16(%eax)
-	movl	%ecx,32(%eax)
-	movl	%esi,36(%eax)
-	movl	%edx,48(%eax)
-	movl	%edi,56(%eax)
-	movl	4(%esp),%ebp
-	movl	8(%esp),%ecx
-	movl	12(%esp),%esi
-	movl	20(%esp),%edx
-	movl	24(%esp),%edi
-	addl	$857760878,%ebp
-	addl	$2036477234,%ecx
-	addl	$1797285236,%esi
-	addl	84(%esp),%edx
-	addl	88(%esp),%edi
-	xorl	4(%ebx),%ebp
-	xorl	8(%ebx),%ecx
-	xorl	12(%ebx),%esi
-	xorl	20(%ebx),%edx
-	xorl	24(%ebx),%edi
-	movl	%ebp,4(%eax)
-	movl	%ecx,8(%eax)
-	movl	%esi,12(%eax)
-	movl	%edx,20(%eax)
-	movl	%edi,24(%eax)
-	movl	28(%esp),%ebp
-	movl	40(%esp),%ecx
-	movl	44(%esp),%esi
-	movl	52(%esp),%edx
-	movl	60(%esp),%edi
-	addl	92(%esp),%ebp
-	addl	104(%esp),%ecx
-	addl	108(%esp),%esi
-	addl	116(%esp),%edx
-	addl	124(%esp),%edi
-	xorl	28(%ebx),%ebp
-	xorl	40(%ebx),%ecx
-	xorl	44(%ebx),%esi
-	xorl	52(%ebx),%edx
-	xorl	60(%ebx),%edi
-	leal	64(%ebx),%ebx
-	movl	%ebp,28(%eax)
-	movl	(%esp),%ebp
-	movl	%ecx,40(%eax)
-	movl	160(%esp),%ecx
-	movl	%esi,44(%eax)
-	movl	%edx,52(%eax)
-	movl	%edi,60(%eax)
-	movl	%ebp,(%eax)
-	leal	64(%eax),%eax
-	subl	$64,%ecx
-	jnz	L003outer_loop
-	jmp	L006done
-L005tail:
-	addl	112(%esp),%edx
-	addl	120(%esp),%edi
-	movl	%eax,(%esp)
-	movl	%ebp,16(%esp)
-	movl	%ecx,32(%esp)
-	movl	%esi,36(%esp)
-	movl	%edx,48(%esp)
-	movl	%edi,56(%esp)
-	movl	4(%esp),%ebp
-	movl	8(%esp),%ecx
-	movl	12(%esp),%esi
-	movl	20(%esp),%edx
-	movl	24(%esp),%edi
-	addl	$857760878,%ebp
-	addl	$2036477234,%ecx
-	addl	$1797285236,%esi
-	addl	84(%esp),%edx
-	addl	88(%esp),%edi
-	movl	%ebp,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%esi,12(%esp)
-	movl	%edx,20(%esp)
-	movl	%edi,24(%esp)
-	movl	28(%esp),%ebp
-	movl	40(%esp),%ecx
-	movl	44(%esp),%esi
-	movl	52(%esp),%edx
-	movl	60(%esp),%edi
-	addl	92(%esp),%ebp
-	addl	104(%esp),%ecx
-	addl	108(%esp),%esi
-	addl	116(%esp),%edx
-	addl	124(%esp),%edi
-	movl	%ebp,28(%esp)
-	movl	156(%esp),%ebp
-	movl	%ecx,40(%esp)
-	movl	152(%esp),%ecx
-	movl	%esi,44(%esp)
-	xorl	%esi,%esi
-	movl	%edx,52(%esp)
-	movl	%edi,60(%esp)
-	xorl	%eax,%eax
-	xorl	%edx,%edx
-L007tail_loop:
-	movb	(%esi,%ebp,1),%al
-	movb	(%esp,%esi,1),%dl
-	leal	1(%esi),%esi
-	xorb	%dl,%al
-	movb	%al,-1(%ecx,%esi,1)
-	decl	%ebx
-	jnz	L007tail_loop
-L006done:
-	addl	$132,%esp
-L000no_data:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_ChaCha20_ssse3
-.private_extern	_ChaCha20_ssse3
-.align	4
-_ChaCha20_ssse3:
-L_ChaCha20_ssse3_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-Lssse3_shortcut:
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	movl	28(%esp),%ecx
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebx
-	movl	%esp,%ebp
-	subl	$524,%esp
-	andl	$-64,%esp
-	movl	%ebp,512(%esp)
-	leal	Lssse3_data-Lpic_point(%eax),%eax
-	movdqu	(%ebx),%xmm3
-	cmpl	$256,%ecx
-	jb	L0081x
-	movl	%edx,516(%esp)
-	movl	%ebx,520(%esp)
-	subl	$256,%ecx
-	leal	384(%esp),%ebp
-	movdqu	(%edx),%xmm7
-	pshufd	$0,%xmm3,%xmm0
-	pshufd	$85,%xmm3,%xmm1
-	pshufd	$170,%xmm3,%xmm2
-	pshufd	$255,%xmm3,%xmm3
-	paddd	48(%eax),%xmm0
-	pshufd	$0,%xmm7,%xmm4
-	pshufd	$85,%xmm7,%xmm5
-	psubd	64(%eax),%xmm0
-	pshufd	$170,%xmm7,%xmm6
-	pshufd	$255,%xmm7,%xmm7
-	movdqa	%xmm0,64(%ebp)
-	movdqa	%xmm1,80(%ebp)
-	movdqa	%xmm2,96(%ebp)
-	movdqa	%xmm3,112(%ebp)
-	movdqu	16(%edx),%xmm3
-	movdqa	%xmm4,-64(%ebp)
-	movdqa	%xmm5,-48(%ebp)
-	movdqa	%xmm6,-32(%ebp)
-	movdqa	%xmm7,-16(%ebp)
-	movdqa	32(%eax),%xmm7
-	leal	128(%esp),%ebx
-	pshufd	$0,%xmm3,%xmm0
-	pshufd	$85,%xmm3,%xmm1
-	pshufd	$170,%xmm3,%xmm2
-	pshufd	$255,%xmm3,%xmm3
-	pshufd	$0,%xmm7,%xmm4
-	pshufd	$85,%xmm7,%xmm5
-	pshufd	$170,%xmm7,%xmm6
-	pshufd	$255,%xmm7,%xmm7
-	movdqa	%xmm0,(%ebp)
-	movdqa	%xmm1,16(%ebp)
-	movdqa	%xmm2,32(%ebp)
-	movdqa	%xmm3,48(%ebp)
-	movdqa	%xmm4,-128(%ebp)
-	movdqa	%xmm5,-112(%ebp)
-	movdqa	%xmm6,-96(%ebp)
-	movdqa	%xmm7,-80(%ebp)
-	leal	128(%esi),%esi
-	leal	128(%edi),%edi
-	jmp	L009outer_loop
-.align	4,0x90
-L009outer_loop:
-	movdqa	-112(%ebp),%xmm1
-	movdqa	-96(%ebp),%xmm2
-	movdqa	-80(%ebp),%xmm3
-	movdqa	-48(%ebp),%xmm5
-	movdqa	-32(%ebp),%xmm6
-	movdqa	-16(%ebp),%xmm7
-	movdqa	%xmm1,-112(%ebx)
-	movdqa	%xmm2,-96(%ebx)
-	movdqa	%xmm3,-80(%ebx)
-	movdqa	%xmm5,-48(%ebx)
-	movdqa	%xmm6,-32(%ebx)
-	movdqa	%xmm7,-16(%ebx)
-	movdqa	32(%ebp),%xmm2
-	movdqa	48(%ebp),%xmm3
-	movdqa	64(%ebp),%xmm4
-	movdqa	80(%ebp),%xmm5
-	movdqa	96(%ebp),%xmm6
-	movdqa	112(%ebp),%xmm7
-	paddd	64(%eax),%xmm4
-	movdqa	%xmm2,32(%ebx)
-	movdqa	%xmm3,48(%ebx)
-	movdqa	%xmm4,64(%ebx)
-	movdqa	%xmm5,80(%ebx)
-	movdqa	%xmm6,96(%ebx)
-	movdqa	%xmm7,112(%ebx)
-	movdqa	%xmm4,64(%ebp)
-	movdqa	-128(%ebp),%xmm0
-	movdqa	%xmm4,%xmm6
-	movdqa	-64(%ebp),%xmm3
-	movdqa	(%ebp),%xmm4
-	movdqa	16(%ebp),%xmm5
-	movl	$10,%edx
-	nop
-.align	4,0x90
-L010loop:
-	paddd	%xmm3,%xmm0
-	movdqa	%xmm3,%xmm2
-	pxor	%xmm0,%xmm6
-	pshufb	(%eax),%xmm6
-	paddd	%xmm6,%xmm4
-	pxor	%xmm4,%xmm2
-	movdqa	-48(%ebx),%xmm3
-	movdqa	%xmm2,%xmm1
-	pslld	$12,%xmm2
-	psrld	$20,%xmm1
-	por	%xmm1,%xmm2
-	movdqa	-112(%ebx),%xmm1
-	paddd	%xmm2,%xmm0
-	movdqa	80(%ebx),%xmm7
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm0,-128(%ebx)
-	pshufb	16(%eax),%xmm6
-	paddd	%xmm6,%xmm4
-	movdqa	%xmm6,64(%ebx)
-	pxor	%xmm4,%xmm2
-	paddd	%xmm3,%xmm1
-	movdqa	%xmm2,%xmm0
-	pslld	$7,%xmm2
-	psrld	$25,%xmm0
-	pxor	%xmm1,%xmm7
-	por	%xmm0,%xmm2
-	movdqa	%xmm4,(%ebx)
-	pshufb	(%eax),%xmm7
-	movdqa	%xmm2,-64(%ebx)
-	paddd	%xmm7,%xmm5
-	movdqa	32(%ebx),%xmm4
-	pxor	%xmm5,%xmm3
-	movdqa	-32(%ebx),%xmm2
-	movdqa	%xmm3,%xmm0
-	pslld	$12,%xmm3
-	psrld	$20,%xmm0
-	por	%xmm0,%xmm3
-	movdqa	-96(%ebx),%xmm0
-	paddd	%xmm3,%xmm1
-	movdqa	96(%ebx),%xmm6
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm1,-112(%ebx)
-	pshufb	16(%eax),%xmm7
-	paddd	%xmm7,%xmm5
-	movdqa	%xmm7,80(%ebx)
-	pxor	%xmm5,%xmm3
-	paddd	%xmm2,%xmm0
-	movdqa	%xmm3,%xmm1
-	pslld	$7,%xmm3
-	psrld	$25,%xmm1
-	pxor	%xmm0,%xmm6
-	por	%xmm1,%xmm3
-	movdqa	%xmm5,16(%ebx)
-	pshufb	(%eax),%xmm6
-	movdqa	%xmm3,-48(%ebx)
-	paddd	%xmm6,%xmm4
-	movdqa	48(%ebx),%xmm5
-	pxor	%xmm4,%xmm2
-	movdqa	-16(%ebx),%xmm3
-	movdqa	%xmm2,%xmm1
-	pslld	$12,%xmm2
-	psrld	$20,%xmm1
-	por	%xmm1,%xmm2
-	movdqa	-80(%ebx),%xmm1
-	paddd	%xmm2,%xmm0
-	movdqa	112(%ebx),%xmm7
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm0,-96(%ebx)
-	pshufb	16(%eax),%xmm6
-	paddd	%xmm6,%xmm4
-	movdqa	%xmm6,96(%ebx)
-	pxor	%xmm4,%xmm2
-	paddd	%xmm3,%xmm1
-	movdqa	%xmm2,%xmm0
-	pslld	$7,%xmm2
-	psrld	$25,%xmm0
-	pxor	%xmm1,%xmm7
-	por	%xmm0,%xmm2
-	pshufb	(%eax),%xmm7
-	movdqa	%xmm2,-32(%ebx)
-	paddd	%xmm7,%xmm5
-	pxor	%xmm5,%xmm3
-	movdqa	-48(%ebx),%xmm2
-	movdqa	%xmm3,%xmm0
-	pslld	$12,%xmm3
-	psrld	$20,%xmm0
-	por	%xmm0,%xmm3
-	movdqa	-128(%ebx),%xmm0
-	paddd	%xmm3,%xmm1
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm1,-80(%ebx)
-	pshufb	16(%eax),%xmm7
-	paddd	%xmm7,%xmm5
-	movdqa	%xmm7,%xmm6
-	pxor	%xmm5,%xmm3
-	paddd	%xmm2,%xmm0
-	movdqa	%xmm3,%xmm1
-	pslld	$7,%xmm3
-	psrld	$25,%xmm1
-	pxor	%xmm0,%xmm6
-	por	%xmm1,%xmm3
-	pshufb	(%eax),%xmm6
-	movdqa	%xmm3,-16(%ebx)
-	paddd	%xmm6,%xmm4
-	pxor	%xmm4,%xmm2
-	movdqa	-32(%ebx),%xmm3
-	movdqa	%xmm2,%xmm1
-	pslld	$12,%xmm2
-	psrld	$20,%xmm1
-	por	%xmm1,%xmm2
-	movdqa	-112(%ebx),%xmm1
-	paddd	%xmm2,%xmm0
-	movdqa	64(%ebx),%xmm7
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm0,-128(%ebx)
-	pshufb	16(%eax),%xmm6
-	paddd	%xmm6,%xmm4
-	movdqa	%xmm6,112(%ebx)
-	pxor	%xmm4,%xmm2
-	paddd	%xmm3,%xmm1
-	movdqa	%xmm2,%xmm0
-	pslld	$7,%xmm2
-	psrld	$25,%xmm0
-	pxor	%xmm1,%xmm7
-	por	%xmm0,%xmm2
-	movdqa	%xmm4,32(%ebx)
-	pshufb	(%eax),%xmm7
-	movdqa	%xmm2,-48(%ebx)
-	paddd	%xmm7,%xmm5
-	movdqa	(%ebx),%xmm4
-	pxor	%xmm5,%xmm3
-	movdqa	-16(%ebx),%xmm2
-	movdqa	%xmm3,%xmm0
-	pslld	$12,%xmm3
-	psrld	$20,%xmm0
-	por	%xmm0,%xmm3
-	movdqa	-96(%ebx),%xmm0
-	paddd	%xmm3,%xmm1
-	movdqa	80(%ebx),%xmm6
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm1,-112(%ebx)
-	pshufb	16(%eax),%xmm7
-	paddd	%xmm7,%xmm5
-	movdqa	%xmm7,64(%ebx)
-	pxor	%xmm5,%xmm3
-	paddd	%xmm2,%xmm0
-	movdqa	%xmm3,%xmm1
-	pslld	$7,%xmm3
-	psrld	$25,%xmm1
-	pxor	%xmm0,%xmm6
-	por	%xmm1,%xmm3
-	movdqa	%xmm5,48(%ebx)
-	pshufb	(%eax),%xmm6
-	movdqa	%xmm3,-32(%ebx)
-	paddd	%xmm6,%xmm4
-	movdqa	16(%ebx),%xmm5
-	pxor	%xmm4,%xmm2
-	movdqa	-64(%ebx),%xmm3
-	movdqa	%xmm2,%xmm1
-	pslld	$12,%xmm2
-	psrld	$20,%xmm1
-	por	%xmm1,%xmm2
-	movdqa	-80(%ebx),%xmm1
-	paddd	%xmm2,%xmm0
-	movdqa	96(%ebx),%xmm7
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm0,-96(%ebx)
-	pshufb	16(%eax),%xmm6
-	paddd	%xmm6,%xmm4
-	movdqa	%xmm6,80(%ebx)
-	pxor	%xmm4,%xmm2
-	paddd	%xmm3,%xmm1
-	movdqa	%xmm2,%xmm0
-	pslld	$7,%xmm2
-	psrld	$25,%xmm0
-	pxor	%xmm1,%xmm7
-	por	%xmm0,%xmm2
-	pshufb	(%eax),%xmm7
-	movdqa	%xmm2,-16(%ebx)
-	paddd	%xmm7,%xmm5
-	pxor	%xmm5,%xmm3
-	movdqa	%xmm3,%xmm0
-	pslld	$12,%xmm3
-	psrld	$20,%xmm0
-	por	%xmm0,%xmm3
-	movdqa	-128(%ebx),%xmm0
-	paddd	%xmm3,%xmm1
-	movdqa	64(%ebx),%xmm6
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm1,-80(%ebx)
-	pshufb	16(%eax),%xmm7
-	paddd	%xmm7,%xmm5
-	movdqa	%xmm7,96(%ebx)
-	pxor	%xmm5,%xmm3
-	movdqa	%xmm3,%xmm1
-	pslld	$7,%xmm3
-	psrld	$25,%xmm1
-	por	%xmm1,%xmm3
-	decl	%edx
-	jnz	L010loop
-	movdqa	%xmm3,-64(%ebx)
-	movdqa	%xmm4,(%ebx)
-	movdqa	%xmm5,16(%ebx)
-	movdqa	%xmm6,64(%ebx)
-	movdqa	%xmm7,96(%ebx)
-	movdqa	-112(%ebx),%xmm1
-	movdqa	-96(%ebx),%xmm2
-	movdqa	-80(%ebx),%xmm3
-	paddd	-128(%ebp),%xmm0
-	paddd	-112(%ebp),%xmm1
-	paddd	-96(%ebp),%xmm2
-	paddd	-80(%ebp),%xmm3
-	movdqa	%xmm0,%xmm6
-	punpckldq	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm7
-	punpckldq	%xmm3,%xmm2
-	punpckhdq	%xmm1,%xmm6
-	punpckhdq	%xmm3,%xmm7
-	movdqa	%xmm0,%xmm1
-	punpcklqdq	%xmm2,%xmm0
-	movdqa	%xmm6,%xmm3
-	punpcklqdq	%xmm7,%xmm6
-	punpckhqdq	%xmm2,%xmm1
-	punpckhqdq	%xmm7,%xmm3
-	movdqu	-128(%esi),%xmm4
-	movdqu	-64(%esi),%xmm5
-	movdqu	(%esi),%xmm2
-	movdqu	64(%esi),%xmm7
-	leal	16(%esi),%esi
-	pxor	%xmm0,%xmm4
-	movdqa	-64(%ebx),%xmm0
-	pxor	%xmm1,%xmm5
-	movdqa	-48(%ebx),%xmm1
-	pxor	%xmm2,%xmm6
-	movdqa	-32(%ebx),%xmm2
-	pxor	%xmm3,%xmm7
-	movdqa	-16(%ebx),%xmm3
-	movdqu	%xmm4,-128(%edi)
-	movdqu	%xmm5,-64(%edi)
-	movdqu	%xmm6,(%edi)
-	movdqu	%xmm7,64(%edi)
-	leal	16(%edi),%edi
-	paddd	-64(%ebp),%xmm0
-	paddd	-48(%ebp),%xmm1
-	paddd	-32(%ebp),%xmm2
-	paddd	-16(%ebp),%xmm3
-	movdqa	%xmm0,%xmm6
-	punpckldq	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm7
-	punpckldq	%xmm3,%xmm2
-	punpckhdq	%xmm1,%xmm6
-	punpckhdq	%xmm3,%xmm7
-	movdqa	%xmm0,%xmm1
-	punpcklqdq	%xmm2,%xmm0
-	movdqa	%xmm6,%xmm3
-	punpcklqdq	%xmm7,%xmm6
-	punpckhqdq	%xmm2,%xmm1
-	punpckhqdq	%xmm7,%xmm3
-	movdqu	-128(%esi),%xmm4
-	movdqu	-64(%esi),%xmm5
-	movdqu	(%esi),%xmm2
-	movdqu	64(%esi),%xmm7
-	leal	16(%esi),%esi
-	pxor	%xmm0,%xmm4
-	movdqa	(%ebx),%xmm0
-	pxor	%xmm1,%xmm5
-	movdqa	16(%ebx),%xmm1
-	pxor	%xmm2,%xmm6
-	movdqa	32(%ebx),%xmm2
-	pxor	%xmm3,%xmm7
-	movdqa	48(%ebx),%xmm3
-	movdqu	%xmm4,-128(%edi)
-	movdqu	%xmm5,-64(%edi)
-	movdqu	%xmm6,(%edi)
-	movdqu	%xmm7,64(%edi)
-	leal	16(%edi),%edi
-	paddd	(%ebp),%xmm0
-	paddd	16(%ebp),%xmm1
-	paddd	32(%ebp),%xmm2
-	paddd	48(%ebp),%xmm3
-	movdqa	%xmm0,%xmm6
-	punpckldq	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm7
-	punpckldq	%xmm3,%xmm2
-	punpckhdq	%xmm1,%xmm6
-	punpckhdq	%xmm3,%xmm7
-	movdqa	%xmm0,%xmm1
-	punpcklqdq	%xmm2,%xmm0
-	movdqa	%xmm6,%xmm3
-	punpcklqdq	%xmm7,%xmm6
-	punpckhqdq	%xmm2,%xmm1
-	punpckhqdq	%xmm7,%xmm3
-	movdqu	-128(%esi),%xmm4
-	movdqu	-64(%esi),%xmm5
-	movdqu	(%esi),%xmm2
-	movdqu	64(%esi),%xmm7
-	leal	16(%esi),%esi
-	pxor	%xmm0,%xmm4
-	movdqa	64(%ebx),%xmm0
-	pxor	%xmm1,%xmm5
-	movdqa	80(%ebx),%xmm1
-	pxor	%xmm2,%xmm6
-	movdqa	96(%ebx),%xmm2
-	pxor	%xmm3,%xmm7
-	movdqa	112(%ebx),%xmm3
-	movdqu	%xmm4,-128(%edi)
-	movdqu	%xmm5,-64(%edi)
-	movdqu	%xmm6,(%edi)
-	movdqu	%xmm7,64(%edi)
-	leal	16(%edi),%edi
-	paddd	64(%ebp),%xmm0
-	paddd	80(%ebp),%xmm1
-	paddd	96(%ebp),%xmm2
-	paddd	112(%ebp),%xmm3
-	movdqa	%xmm0,%xmm6
-	punpckldq	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm7
-	punpckldq	%xmm3,%xmm2
-	punpckhdq	%xmm1,%xmm6
-	punpckhdq	%xmm3,%xmm7
-	movdqa	%xmm0,%xmm1
-	punpcklqdq	%xmm2,%xmm0
-	movdqa	%xmm6,%xmm3
-	punpcklqdq	%xmm7,%xmm6
-	punpckhqdq	%xmm2,%xmm1
-	punpckhqdq	%xmm7,%xmm3
-	movdqu	-128(%esi),%xmm4
-	movdqu	-64(%esi),%xmm5
-	movdqu	(%esi),%xmm2
-	movdqu	64(%esi),%xmm7
-	leal	208(%esi),%esi
-	pxor	%xmm0,%xmm4
-	pxor	%xmm1,%xmm5
-	pxor	%xmm2,%xmm6
-	pxor	%xmm3,%xmm7
-	movdqu	%xmm4,-128(%edi)
-	movdqu	%xmm5,-64(%edi)
-	movdqu	%xmm6,(%edi)
-	movdqu	%xmm7,64(%edi)
-	leal	208(%edi),%edi
-	subl	$256,%ecx
-	jnc	L009outer_loop
-	addl	$256,%ecx
-	jz	L011done
-	movl	520(%esp),%ebx
-	leal	-128(%esi),%esi
-	movl	516(%esp),%edx
-	leal	-128(%edi),%edi
-	movd	64(%ebp),%xmm2
-	movdqu	(%ebx),%xmm3
-	paddd	96(%eax),%xmm2
-	pand	112(%eax),%xmm3
-	por	%xmm2,%xmm3
-L0081x:
-	movdqa	32(%eax),%xmm0
-	movdqu	(%edx),%xmm1
-	movdqu	16(%edx),%xmm2
-	movdqa	(%eax),%xmm6
-	movdqa	16(%eax),%xmm7
-	movl	%ebp,48(%esp)
-	movdqa	%xmm0,(%esp)
-	movdqa	%xmm1,16(%esp)
-	movdqa	%xmm2,32(%esp)
-	movdqa	%xmm3,48(%esp)
-	movl	$10,%edx
-	jmp	L012loop1x
-.align	4,0x90
-L013outer1x:
-	movdqa	80(%eax),%xmm3
-	movdqa	(%esp),%xmm0
-	movdqa	16(%esp),%xmm1
-	movdqa	32(%esp),%xmm2
-	paddd	48(%esp),%xmm3
-	movl	$10,%edx
-	movdqa	%xmm3,48(%esp)
-	jmp	L012loop1x
-.align	4,0x90
-L012loop1x:
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$20,%xmm1
-	pslld	$12,%xmm4
-	por	%xmm4,%xmm1
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$25,%xmm1
-	pslld	$7,%xmm4
-	por	%xmm4,%xmm1
-	pshufd	$78,%xmm2,%xmm2
-	pshufd	$57,%xmm1,%xmm1
-	pshufd	$147,%xmm3,%xmm3
-	nop
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$20,%xmm1
-	pslld	$12,%xmm4
-	por	%xmm4,%xmm1
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$25,%xmm1
-	pslld	$7,%xmm4
-	por	%xmm4,%xmm1
-	pshufd	$78,%xmm2,%xmm2
-	pshufd	$147,%xmm1,%xmm1
-	pshufd	$57,%xmm3,%xmm3
-	decl	%edx
-	jnz	L012loop1x
-	paddd	(%esp),%xmm0
-	paddd	16(%esp),%xmm1
-	paddd	32(%esp),%xmm2
-	paddd	48(%esp),%xmm3
-	cmpl	$64,%ecx
-	jb	L014tail
-	movdqu	(%esi),%xmm4
-	movdqu	16(%esi),%xmm5
-	pxor	%xmm4,%xmm0
-	movdqu	32(%esi),%xmm4
-	pxor	%xmm5,%xmm1
-	movdqu	48(%esi),%xmm5
-	pxor	%xmm4,%xmm2
-	pxor	%xmm5,%xmm3
-	leal	64(%esi),%esi
-	movdqu	%xmm0,(%edi)
-	movdqu	%xmm1,16(%edi)
-	movdqu	%xmm2,32(%edi)
-	movdqu	%xmm3,48(%edi)
-	leal	64(%edi),%edi
-	subl	$64,%ecx
-	jnz	L013outer1x
-	jmp	L011done
-L014tail:
-	movdqa	%xmm0,(%esp)
-	movdqa	%xmm1,16(%esp)
-	movdqa	%xmm2,32(%esp)
-	movdqa	%xmm3,48(%esp)
-	xorl	%eax,%eax
-	xorl	%edx,%edx
-	xorl	%ebp,%ebp
-L015tail_loop:
-	movb	(%esp,%ebp,1),%al
-	movb	(%esi,%ebp,1),%dl
-	leal	1(%ebp),%ebp
-	xorb	%dl,%al
-	movb	%al,-1(%edi,%ebp,1)
-	decl	%ecx
-	jnz	L015tail_loop
-L011done:
-	movl	512(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	6,0x90
-Lssse3_data:
-.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
-.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
-.long	1634760805,857760878,2036477234,1797285236
-.long	0,1,2,3
-.long	4,4,4,4
-.long	1,0,0,0
-.long	4,0,0,0
-.long	0,-1,-1,-1
-.align	6,0x90
-.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
-.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
-.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
-.byte	114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/aesni-x86-apple.S b/apple-x86/crypto/fipsmodule/aesni-x86-apple.S
deleted file mode 100644
index 4467604..0000000
--- a/apple-x86/crypto/fipsmodule/aesni-x86-apple.S
+++ /dev/null
@@ -1,2475 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-#ifdef BORINGSSL_DISPATCH_TEST
-#endif
-.globl	_aes_hw_encrypt
-.private_extern	_aes_hw_encrypt
-.align	4
-_aes_hw_encrypt:
-L_aes_hw_encrypt_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L000pic
-L000pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	4(%esp),%eax
-	movl	12(%esp),%edx
-	movups	(%eax),%xmm2
-	movl	240(%edx),%ecx
-	movl	8(%esp),%eax
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L001enc1_loop_1:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L001enc1_loop_1
-.byte	102,15,56,221,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movups	%xmm2,(%eax)
-	pxor	%xmm2,%xmm2
-	ret
-.globl	_aes_hw_decrypt
-.private_extern	_aes_hw_decrypt
-.align	4
-_aes_hw_decrypt:
-L_aes_hw_decrypt_begin:
-	movl	4(%esp),%eax
-	movl	12(%esp),%edx
-	movups	(%eax),%xmm2
-	movl	240(%edx),%ecx
-	movl	8(%esp),%eax
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L002dec1_loop_2:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L002dec1_loop_2
-.byte	102,15,56,223,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movups	%xmm2,(%eax)
-	pxor	%xmm2,%xmm2
-	ret
-.private_extern	__aesni_encrypt2
-.align	4
-__aesni_encrypt2:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-	addl	$16,%ecx
-L003enc2_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L003enc2_loop
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-	ret
-.private_extern	__aesni_decrypt2
-.align	4
-__aesni_decrypt2:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-	addl	$16,%ecx
-L004dec2_loop:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L004dec2_loop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-	ret
-.private_extern	__aesni_encrypt3
-.align	4
-__aesni_encrypt3:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-	addl	$16,%ecx
-L005enc3_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L005enc3_loop
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-	ret
-.private_extern	__aesni_decrypt3
-.align	4
-__aesni_decrypt3:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-	addl	$16,%ecx
-L006dec3_loop:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L006dec3_loop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-	ret
-.private_extern	__aesni_encrypt4
-.align	4
-__aesni_encrypt4:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	shll	$4,%ecx
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	pxor	%xmm0,%xmm5
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-.byte	15,31,64,0
-	addl	$16,%ecx
-L007enc4_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L007enc4_loop
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-	ret
-.private_extern	__aesni_decrypt4
-.align	4
-__aesni_decrypt4:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	shll	$4,%ecx
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	pxor	%xmm0,%xmm5
-	movups	32(%edx),%xmm0
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-.byte	15,31,64,0
-	addl	$16,%ecx
-L008dec4_loop:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L008dec4_loop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-	ret
-.private_extern	__aesni_encrypt6
-.align	4
-__aesni_encrypt6:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-.byte	102,15,56,220,209
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,217
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-.byte	102,15,56,220,225
-	pxor	%xmm0,%xmm7
-	movups	(%edx,%ecx,1),%xmm0
-	addl	$16,%ecx
-	jmp	L009_aesni_encrypt6_inner
-.align	4,0x90
-L010enc6_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-L009_aesni_encrypt6_inner:
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-L_aesni_encrypt6_enter:
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L010enc6_loop
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
-	ret
-.private_extern	__aesni_decrypt6
-.align	4
-__aesni_decrypt6:
-	movups	(%edx),%xmm0
-	shll	$4,%ecx
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-.byte	102,15,56,222,209
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,217
-	leal	32(%edx,%ecx,1),%edx
-	negl	%ecx
-.byte	102,15,56,222,225
-	pxor	%xmm0,%xmm7
-	movups	(%edx,%ecx,1),%xmm0
-	addl	$16,%ecx
-	jmp	L011_aesni_decrypt6_inner
-.align	4,0x90
-L012dec6_loop:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-L011_aesni_decrypt6_inner:
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-L_aesni_decrypt6_enter:
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L012dec6_loop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
-	ret
-.globl	_aes_hw_ecb_encrypt
-.private_extern	_aes_hw_ecb_encrypt
-.align	4
-_aes_hw_ecb_encrypt:
-L_aes_hw_ecb_encrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebx
-	andl	$-16,%eax
-	jz	L013ecb_ret
-	movl	240(%edx),%ecx
-	testl	%ebx,%ebx
-	jz	L014ecb_decrypt
-	movl	%edx,%ebp
-	movl	%ecx,%ebx
-	cmpl	$96,%eax
-	jb	L015ecb_enc_tail
-	movdqu	(%esi),%xmm2
-	movdqu	16(%esi),%xmm3
-	movdqu	32(%esi),%xmm4
-	movdqu	48(%esi),%xmm5
-	movdqu	64(%esi),%xmm6
-	movdqu	80(%esi),%xmm7
-	leal	96(%esi),%esi
-	subl	$96,%eax
-	jmp	L016ecb_enc_loop6_enter
-.align	4,0x90
-L017ecb_enc_loop6:
-	movups	%xmm2,(%edi)
-	movdqu	(%esi),%xmm2
-	movups	%xmm3,16(%edi)
-	movdqu	16(%esi),%xmm3
-	movups	%xmm4,32(%edi)
-	movdqu	32(%esi),%xmm4
-	movups	%xmm5,48(%edi)
-	movdqu	48(%esi),%xmm5
-	movups	%xmm6,64(%edi)
-	movdqu	64(%esi),%xmm6
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	movdqu	80(%esi),%xmm7
-	leal	96(%esi),%esi
-L016ecb_enc_loop6_enter:
-	call	__aesni_encrypt6
-	movl	%ebp,%edx
-	movl	%ebx,%ecx
-	subl	$96,%eax
-	jnc	L017ecb_enc_loop6
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	addl	$96,%eax
-	jz	L013ecb_ret
-L015ecb_enc_tail:
-	movups	(%esi),%xmm2
-	cmpl	$32,%eax
-	jb	L018ecb_enc_one
-	movups	16(%esi),%xmm3
-	je	L019ecb_enc_two
-	movups	32(%esi),%xmm4
-	cmpl	$64,%eax
-	jb	L020ecb_enc_three
-	movups	48(%esi),%xmm5
-	je	L021ecb_enc_four
-	movups	64(%esi),%xmm6
-	xorps	%xmm7,%xmm7
-	call	__aesni_encrypt6
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L018ecb_enc_one:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L022enc1_loop_3:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L022enc1_loop_3
-.byte	102,15,56,221,209
-	movups	%xmm2,(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L019ecb_enc_two:
-	call	__aesni_encrypt2
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L020ecb_enc_three:
-	call	__aesni_encrypt3
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L021ecb_enc_four:
-	call	__aesni_encrypt4
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L014ecb_decrypt:
-	movl	%edx,%ebp
-	movl	%ecx,%ebx
-	cmpl	$96,%eax
-	jb	L023ecb_dec_tail
-	movdqu	(%esi),%xmm2
-	movdqu	16(%esi),%xmm3
-	movdqu	32(%esi),%xmm4
-	movdqu	48(%esi),%xmm5
-	movdqu	64(%esi),%xmm6
-	movdqu	80(%esi),%xmm7
-	leal	96(%esi),%esi
-	subl	$96,%eax
-	jmp	L024ecb_dec_loop6_enter
-.align	4,0x90
-L025ecb_dec_loop6:
-	movups	%xmm2,(%edi)
-	movdqu	(%esi),%xmm2
-	movups	%xmm3,16(%edi)
-	movdqu	16(%esi),%xmm3
-	movups	%xmm4,32(%edi)
-	movdqu	32(%esi),%xmm4
-	movups	%xmm5,48(%edi)
-	movdqu	48(%esi),%xmm5
-	movups	%xmm6,64(%edi)
-	movdqu	64(%esi),%xmm6
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	movdqu	80(%esi),%xmm7
-	leal	96(%esi),%esi
-L024ecb_dec_loop6_enter:
-	call	__aesni_decrypt6
-	movl	%ebp,%edx
-	movl	%ebx,%ecx
-	subl	$96,%eax
-	jnc	L025ecb_dec_loop6
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	addl	$96,%eax
-	jz	L013ecb_ret
-L023ecb_dec_tail:
-	movups	(%esi),%xmm2
-	cmpl	$32,%eax
-	jb	L026ecb_dec_one
-	movups	16(%esi),%xmm3
-	je	L027ecb_dec_two
-	movups	32(%esi),%xmm4
-	cmpl	$64,%eax
-	jb	L028ecb_dec_three
-	movups	48(%esi),%xmm5
-	je	L029ecb_dec_four
-	movups	64(%esi),%xmm6
-	xorps	%xmm7,%xmm7
-	call	__aesni_decrypt6
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L026ecb_dec_one:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L030dec1_loop_4:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L030dec1_loop_4
-.byte	102,15,56,223,209
-	movups	%xmm2,(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L027ecb_dec_two:
-	call	__aesni_decrypt2
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L028ecb_dec_three:
-	call	__aesni_decrypt3
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	jmp	L013ecb_ret
-.align	4,0x90
-L029ecb_dec_four:
-	call	__aesni_decrypt4
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-L013ecb_ret:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_ccm64_encrypt_blocks
-.private_extern	_aes_hw_ccm64_encrypt_blocks
-.align	4
-_aes_hw_ccm64_encrypt_blocks:
-L_aes_hw_ccm64_encrypt_blocks_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebx
-	movl	40(%esp),%ecx
-	movl	%esp,%ebp
-	subl	$60,%esp
-	andl	$-16,%esp
-	movl	%ebp,48(%esp)
-	movdqu	(%ebx),%xmm7
-	movdqu	(%ecx),%xmm3
-	movl	240(%edx),%ecx
-	movl	$202182159,(%esp)
-	movl	$134810123,4(%esp)
-	movl	$67438087,8(%esp)
-	movl	$66051,12(%esp)
-	movl	$1,%ebx
-	xorl	%ebp,%ebp
-	movl	%ebx,16(%esp)
-	movl	%ebp,20(%esp)
-	movl	%ebp,24(%esp)
-	movl	%ebp,28(%esp)
-	shll	$4,%ecx
-	movl	$16,%ebx
-	leal	(%edx),%ebp
-	movdqa	(%esp),%xmm5
-	movdqa	%xmm7,%xmm2
-	leal	32(%edx,%ecx,1),%edx
-	subl	%ecx,%ebx
-.byte	102,15,56,0,253
-L031ccm64_enc_outer:
-	movups	(%ebp),%xmm0
-	movl	%ebx,%ecx
-	movups	(%esi),%xmm6
-	xorps	%xmm0,%xmm2
-	movups	16(%ebp),%xmm1
-	xorps	%xmm6,%xmm0
-	xorps	%xmm0,%xmm3
-	movups	32(%ebp),%xmm0
-L032ccm64_enc2_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L032ccm64_enc2_loop
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	paddq	16(%esp),%xmm7
-	decl	%eax
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-	leal	16(%esi),%esi
-	xorps	%xmm2,%xmm6
-	movdqa	%xmm7,%xmm2
-	movups	%xmm6,(%edi)
-.byte	102,15,56,0,213
-	leal	16(%edi),%edi
-	jnz	L031ccm64_enc_outer
-	movl	48(%esp),%esp
-	movl	40(%esp),%edi
-	movups	%xmm3,(%edi)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_ccm64_decrypt_blocks
-.private_extern	_aes_hw_ccm64_decrypt_blocks
-.align	4
-_aes_hw_ccm64_decrypt_blocks:
-L_aes_hw_ccm64_decrypt_blocks_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebx
-	movl	40(%esp),%ecx
-	movl	%esp,%ebp
-	subl	$60,%esp
-	andl	$-16,%esp
-	movl	%ebp,48(%esp)
-	movdqu	(%ebx),%xmm7
-	movdqu	(%ecx),%xmm3
-	movl	240(%edx),%ecx
-	movl	$202182159,(%esp)
-	movl	$134810123,4(%esp)
-	movl	$67438087,8(%esp)
-	movl	$66051,12(%esp)
-	movl	$1,%ebx
-	xorl	%ebp,%ebp
-	movl	%ebx,16(%esp)
-	movl	%ebp,20(%esp)
-	movl	%ebp,24(%esp)
-	movl	%ebp,28(%esp)
-	movdqa	(%esp),%xmm5
-	movdqa	%xmm7,%xmm2
-	movl	%edx,%ebp
-	movl	%ecx,%ebx
-.byte	102,15,56,0,253
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L033enc1_loop_5:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L033enc1_loop_5
-.byte	102,15,56,221,209
-	shll	$4,%ebx
-	movl	$16,%ecx
-	movups	(%esi),%xmm6
-	paddq	16(%esp),%xmm7
-	leal	16(%esi),%esi
-	subl	%ebx,%ecx
-	leal	32(%ebp,%ebx,1),%edx
-	movl	%ecx,%ebx
-	jmp	L034ccm64_dec_outer
-.align	4,0x90
-L034ccm64_dec_outer:
-	xorps	%xmm2,%xmm6
-	movdqa	%xmm7,%xmm2
-	movups	%xmm6,(%edi)
-	leal	16(%edi),%edi
-.byte	102,15,56,0,213
-	subl	$1,%eax
-	jz	L035ccm64_dec_break
-	movups	(%ebp),%xmm0
-	movl	%ebx,%ecx
-	movups	16(%ebp),%xmm1
-	xorps	%xmm0,%xmm6
-	xorps	%xmm0,%xmm2
-	xorps	%xmm6,%xmm3
-	movups	32(%ebp),%xmm0
-L036ccm64_dec2_loop:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	movups	(%edx,%ecx,1),%xmm1
-	addl	$32,%ecx
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	movups	-16(%edx,%ecx,1),%xmm0
-	jnz	L036ccm64_dec2_loop
-	movups	(%esi),%xmm6
-	paddq	16(%esp),%xmm7
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-	leal	16(%esi),%esi
-	jmp	L034ccm64_dec_outer
-.align	4,0x90
-L035ccm64_dec_break:
-	movl	240(%ebp),%ecx
-	movl	%ebp,%edx
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm6
-	leal	32(%edx),%edx
-	xorps	%xmm6,%xmm3
-L037enc1_loop_6:
-.byte	102,15,56,220,217
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L037enc1_loop_6
-.byte	102,15,56,221,217
-	movl	48(%esp),%esp
-	movl	40(%esp),%edi
-	movups	%xmm3,(%edi)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_ctr32_encrypt_blocks
-.private_extern	_aes_hw_ctr32_encrypt_blocks
-.align	4
-_aes_hw_ctr32_encrypt_blocks:
-L_aes_hw_ctr32_encrypt_blocks_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L038pic
-L038pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebx
-	movl	%esp,%ebp
-	subl	$88,%esp
-	andl	$-16,%esp
-	movl	%ebp,80(%esp)
-	cmpl	$1,%eax
-	je	L039ctr32_one_shortcut
-	movdqu	(%ebx),%xmm7
-	movl	$202182159,(%esp)
-	movl	$134810123,4(%esp)
-	movl	$67438087,8(%esp)
-	movl	$66051,12(%esp)
-	movl	$6,%ecx
-	xorl	%ebp,%ebp
-	movl	%ecx,16(%esp)
-	movl	%ecx,20(%esp)
-	movl	%ecx,24(%esp)
-	movl	%ebp,28(%esp)
-.byte	102,15,58,22,251,3
-.byte	102,15,58,34,253,3
-	movl	240(%edx),%ecx
-	bswap	%ebx
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movdqa	(%esp),%xmm2
-.byte	102,15,58,34,195,0
-	leal	3(%ebx),%ebp
-.byte	102,15,58,34,205,0
-	incl	%ebx
-.byte	102,15,58,34,195,1
-	incl	%ebp
-.byte	102,15,58,34,205,1
-	incl	%ebx
-.byte	102,15,58,34,195,2
-	incl	%ebp
-.byte	102,15,58,34,205,2
-	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
-	movdqu	(%edx),%xmm6
-	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
-	pshufd	$192,%xmm0,%xmm2
-	pshufd	$128,%xmm0,%xmm3
-	cmpl	$6,%eax
-	jb	L040ctr32_tail
-	pxor	%xmm6,%xmm7
-	shll	$4,%ecx
-	movl	$16,%ebx
-	movdqa	%xmm7,32(%esp)
-	movl	%edx,%ebp
-	subl	%ecx,%ebx
-	leal	32(%edx,%ecx,1),%edx
-	subl	$6,%eax
-	jmp	L041ctr32_loop6
-.align	4,0x90
-L041ctr32_loop6:
-	pshufd	$64,%xmm0,%xmm4
-	movdqa	32(%esp),%xmm0
-	pshufd	$192,%xmm1,%xmm5
-	pxor	%xmm0,%xmm2
-	pshufd	$128,%xmm1,%xmm6
-	pxor	%xmm0,%xmm3
-	pshufd	$64,%xmm1,%xmm7
-	movups	16(%ebp),%xmm1
-	pxor	%xmm0,%xmm4
-	pxor	%xmm0,%xmm5
-.byte	102,15,56,220,209
-	pxor	%xmm0,%xmm6
-	pxor	%xmm0,%xmm7
-.byte	102,15,56,220,217
-	movups	32(%ebp),%xmm0
-	movl	%ebx,%ecx
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-	call	L_aesni_encrypt6_enter
-	movups	(%esi),%xmm1
-	movups	16(%esi),%xmm0
-	xorps	%xmm1,%xmm2
-	movups	32(%esi),%xmm1
-	xorps	%xmm0,%xmm3
-	movups	%xmm2,(%edi)
-	movdqa	16(%esp),%xmm0
-	xorps	%xmm1,%xmm4
-	movdqa	64(%esp),%xmm1
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	paddd	%xmm0,%xmm1
-	paddd	48(%esp),%xmm0
-	movdqa	(%esp),%xmm2
-	movups	48(%esi),%xmm3
-	movups	64(%esi),%xmm4
-	xorps	%xmm3,%xmm5
-	movups	80(%esi),%xmm3
-	leal	96(%esi),%esi
-	movdqa	%xmm0,48(%esp)
-.byte	102,15,56,0,194
-	xorps	%xmm4,%xmm6
-	movups	%xmm5,48(%edi)
-	xorps	%xmm3,%xmm7
-	movdqa	%xmm1,64(%esp)
-.byte	102,15,56,0,202
-	movups	%xmm6,64(%edi)
-	pshufd	$192,%xmm0,%xmm2
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	pshufd	$128,%xmm0,%xmm3
-	subl	$6,%eax
-	jnc	L041ctr32_loop6
-	addl	$6,%eax
-	jz	L042ctr32_ret
-	movdqu	(%ebp),%xmm7
-	movl	%ebp,%edx
-	pxor	32(%esp),%xmm7
-	movl	240(%ebp),%ecx
-L040ctr32_tail:
-	por	%xmm7,%xmm2
-	cmpl	$2,%eax
-	jb	L043ctr32_one
-	pshufd	$64,%xmm0,%xmm4
-	por	%xmm7,%xmm3
-	je	L044ctr32_two
-	pshufd	$192,%xmm1,%xmm5
-	por	%xmm7,%xmm4
-	cmpl	$4,%eax
-	jb	L045ctr32_three
-	pshufd	$128,%xmm1,%xmm6
-	por	%xmm7,%xmm5
-	je	L046ctr32_four
-	por	%xmm7,%xmm6
-	call	__aesni_encrypt6
-	movups	(%esi),%xmm1
-	movups	16(%esi),%xmm0
-	xorps	%xmm1,%xmm2
-	movups	32(%esi),%xmm1
-	xorps	%xmm0,%xmm3
-	movups	48(%esi),%xmm0
-	xorps	%xmm1,%xmm4
-	movups	64(%esi),%xmm1
-	xorps	%xmm0,%xmm5
-	movups	%xmm2,(%edi)
-	xorps	%xmm1,%xmm6
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	jmp	L042ctr32_ret
-.align	4,0x90
-L039ctr32_one_shortcut:
-	movups	(%ebx),%xmm2
-	movl	240(%edx),%ecx
-L043ctr32_one:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L047enc1_loop_7:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L047enc1_loop_7
-.byte	102,15,56,221,209
-	movups	(%esi),%xmm6
-	xorps	%xmm2,%xmm6
-	movups	%xmm6,(%edi)
-	jmp	L042ctr32_ret
-.align	4,0x90
-L044ctr32_two:
-	call	__aesni_encrypt2
-	movups	(%esi),%xmm5
-	movups	16(%esi),%xmm6
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	jmp	L042ctr32_ret
-.align	4,0x90
-L045ctr32_three:
-	call	__aesni_encrypt3
-	movups	(%esi),%xmm5
-	movups	16(%esi),%xmm6
-	xorps	%xmm5,%xmm2
-	movups	32(%esi),%xmm7
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	xorps	%xmm7,%xmm4
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	jmp	L042ctr32_ret
-.align	4,0x90
-L046ctr32_four:
-	call	__aesni_encrypt4
-	movups	(%esi),%xmm6
-	movups	16(%esi),%xmm7
-	movups	32(%esi),%xmm1
-	xorps	%xmm6,%xmm2
-	movups	48(%esi),%xmm0
-	xorps	%xmm7,%xmm3
-	movups	%xmm2,(%edi)
-	xorps	%xmm1,%xmm4
-	movups	%xmm3,16(%edi)
-	xorps	%xmm0,%xmm5
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-L042ctr32_ret:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	movdqa	%xmm0,32(%esp)
-	pxor	%xmm5,%xmm5
-	movdqa	%xmm0,48(%esp)
-	pxor	%xmm6,%xmm6
-	movdqa	%xmm0,64(%esp)
-	pxor	%xmm7,%xmm7
-	movl	80(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_xts_encrypt
-.private_extern	_aes_hw_xts_encrypt
-.align	4
-_aes_hw_xts_encrypt:
-L_aes_hw_xts_encrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	36(%esp),%edx
-	movl	40(%esp),%esi
-	movl	240(%edx),%ecx
-	movups	(%esi),%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L048enc1_loop_8:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L048enc1_loop_8
-.byte	102,15,56,221,209
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	%esp,%ebp
-	subl	$120,%esp
-	movl	240(%edx),%ecx
-	andl	$-16,%esp
-	movl	$135,96(%esp)
-	movl	$0,100(%esp)
-	movl	$1,104(%esp)
-	movl	$0,108(%esp)
-	movl	%eax,112(%esp)
-	movl	%ebp,116(%esp)
-	movdqa	%xmm2,%xmm1
-	pxor	%xmm0,%xmm0
-	movdqa	96(%esp),%xmm3
-	pcmpgtd	%xmm1,%xmm0
-	andl	$-16,%eax
-	movl	%edx,%ebp
-	movl	%ecx,%ebx
-	subl	$96,%eax
-	jc	L049xts_enc_short
-	shll	$4,%ecx
-	movl	$16,%ebx
-	subl	%ecx,%ebx
-	leal	32(%edx,%ecx,1),%edx
-	jmp	L050xts_enc_loop6
-.align	4,0x90
-L050xts_enc_loop6:
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,16(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,32(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,48(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm7
-	movdqa	%xmm1,64(%esp)
-	paddq	%xmm1,%xmm1
-	movups	(%ebp),%xmm0
-	pand	%xmm3,%xmm7
-	movups	(%esi),%xmm2
-	pxor	%xmm1,%xmm7
-	movl	%ebx,%ecx
-	movdqu	16(%esi),%xmm3
-	xorps	%xmm0,%xmm2
-	movdqu	32(%esi),%xmm4
-	pxor	%xmm0,%xmm3
-	movdqu	48(%esi),%xmm5
-	pxor	%xmm0,%xmm4
-	movdqu	64(%esi),%xmm6
-	pxor	%xmm0,%xmm5
-	movdqu	80(%esi),%xmm1
-	pxor	%xmm0,%xmm6
-	leal	96(%esi),%esi
-	pxor	(%esp),%xmm2
-	movdqa	%xmm7,80(%esp)
-	pxor	%xmm1,%xmm7
-	movups	16(%ebp),%xmm1
-	pxor	16(%esp),%xmm3
-	pxor	32(%esp),%xmm4
-.byte	102,15,56,220,209
-	pxor	48(%esp),%xmm5
-	pxor	64(%esp),%xmm6
-.byte	102,15,56,220,217
-	pxor	%xmm0,%xmm7
-	movups	32(%ebp),%xmm0
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-	call	L_aesni_encrypt6_enter
-	movdqa	80(%esp),%xmm1
-	pxor	%xmm0,%xmm0
-	xorps	(%esp),%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	xorps	16(%esp),%xmm3
-	movups	%xmm2,(%edi)
-	xorps	32(%esp),%xmm4
-	movups	%xmm3,16(%edi)
-	xorps	48(%esp),%xmm5
-	movups	%xmm4,32(%edi)
-	xorps	64(%esp),%xmm6
-	movups	%xmm5,48(%edi)
-	xorps	%xmm1,%xmm7
-	movups	%xmm6,64(%edi)
-	pshufd	$19,%xmm0,%xmm2
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	movdqa	96(%esp),%xmm3
-	pxor	%xmm0,%xmm0
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	subl	$96,%eax
-	jnc	L050xts_enc_loop6
-	movl	240(%ebp),%ecx
-	movl	%ebp,%edx
-	movl	%ecx,%ebx
-L049xts_enc_short:
-	addl	$96,%eax
-	jz	L051xts_enc_done6x
-	movdqa	%xmm1,%xmm5
-	cmpl	$32,%eax
-	jb	L052xts_enc_one
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	je	L053xts_enc_two
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,%xmm6
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	cmpl	$64,%eax
-	jb	L054xts_enc_three
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,%xmm7
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm5,(%esp)
-	movdqa	%xmm6,16(%esp)
-	je	L055xts_enc_four
-	movdqa	%xmm7,32(%esp)
-	pshufd	$19,%xmm0,%xmm7
-	movdqa	%xmm1,48(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm7
-	pxor	%xmm1,%xmm7
-	movdqu	(%esi),%xmm2
-	movdqu	16(%esi),%xmm3
-	movdqu	32(%esi),%xmm4
-	pxor	(%esp),%xmm2
-	movdqu	48(%esi),%xmm5
-	pxor	16(%esp),%xmm3
-	movdqu	64(%esi),%xmm6
-	pxor	32(%esp),%xmm4
-	leal	80(%esi),%esi
-	pxor	48(%esp),%xmm5
-	movdqa	%xmm7,64(%esp)
-	pxor	%xmm7,%xmm6
-	call	__aesni_encrypt6
-	movaps	64(%esp),%xmm1
-	xorps	(%esp),%xmm2
-	xorps	16(%esp),%xmm3
-	xorps	32(%esp),%xmm4
-	movups	%xmm2,(%edi)
-	xorps	48(%esp),%xmm5
-	movups	%xmm3,16(%edi)
-	xorps	%xmm1,%xmm6
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	leal	80(%edi),%edi
-	jmp	L056xts_enc_done
-.align	4,0x90
-L052xts_enc_one:
-	movups	(%esi),%xmm2
-	leal	16(%esi),%esi
-	xorps	%xmm5,%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L057enc1_loop_9:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L057enc1_loop_9
-.byte	102,15,56,221,209
-	xorps	%xmm5,%xmm2
-	movups	%xmm2,(%edi)
-	leal	16(%edi),%edi
-	movdqa	%xmm5,%xmm1
-	jmp	L056xts_enc_done
-.align	4,0x90
-L053xts_enc_two:
-	movaps	%xmm1,%xmm6
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	leal	32(%esi),%esi
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	call	__aesni_encrypt2
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	leal	32(%edi),%edi
-	movdqa	%xmm6,%xmm1
-	jmp	L056xts_enc_done
-.align	4,0x90
-L054xts_enc_three:
-	movaps	%xmm1,%xmm7
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	movups	32(%esi),%xmm4
-	leal	48(%esi),%esi
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm4
-	call	__aesni_encrypt3
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm4
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	leal	48(%edi),%edi
-	movdqa	%xmm7,%xmm1
-	jmp	L056xts_enc_done
-.align	4,0x90
-L055xts_enc_four:
-	movaps	%xmm1,%xmm6
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	movups	32(%esi),%xmm4
-	xorps	(%esp),%xmm2
-	movups	48(%esi),%xmm5
-	leal	64(%esi),%esi
-	xorps	16(%esp),%xmm3
-	xorps	%xmm7,%xmm4
-	xorps	%xmm6,%xmm5
-	call	__aesni_encrypt4
-	xorps	(%esp),%xmm2
-	xorps	16(%esp),%xmm3
-	xorps	%xmm7,%xmm4
-	movups	%xmm2,(%edi)
-	xorps	%xmm6,%xmm5
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	leal	64(%edi),%edi
-	movdqa	%xmm6,%xmm1
-	jmp	L056xts_enc_done
-.align	4,0x90
-L051xts_enc_done6x:
-	movl	112(%esp),%eax
-	andl	$15,%eax
-	jz	L058xts_enc_ret
-	movdqa	%xmm1,%xmm5
-	movl	%eax,112(%esp)
-	jmp	L059xts_enc_steal
-.align	4,0x90
-L056xts_enc_done:
-	movl	112(%esp),%eax
-	pxor	%xmm0,%xmm0
-	andl	$15,%eax
-	jz	L058xts_enc_ret
-	pcmpgtd	%xmm1,%xmm0
-	movl	%eax,112(%esp)
-	pshufd	$19,%xmm0,%xmm5
-	paddq	%xmm1,%xmm1
-	pand	96(%esp),%xmm5
-	pxor	%xmm1,%xmm5
-L059xts_enc_steal:
-	movzbl	(%esi),%ecx
-	movzbl	-16(%edi),%edx
-	leal	1(%esi),%esi
-	movb	%cl,-16(%edi)
-	movb	%dl,(%edi)
-	leal	1(%edi),%edi
-	subl	$1,%eax
-	jnz	L059xts_enc_steal
-	subl	112(%esp),%edi
-	movl	%ebp,%edx
-	movl	%ebx,%ecx
-	movups	-16(%edi),%xmm2
-	xorps	%xmm5,%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L060enc1_loop_10:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L060enc1_loop_10
-.byte	102,15,56,221,209
-	xorps	%xmm5,%xmm2
-	movups	%xmm2,-16(%edi)
-L058xts_enc_ret:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	movdqa	%xmm0,(%esp)
-	pxor	%xmm3,%xmm3
-	movdqa	%xmm0,16(%esp)
-	pxor	%xmm4,%xmm4
-	movdqa	%xmm0,32(%esp)
-	pxor	%xmm5,%xmm5
-	movdqa	%xmm0,48(%esp)
-	pxor	%xmm6,%xmm6
-	movdqa	%xmm0,64(%esp)
-	pxor	%xmm7,%xmm7
-	movdqa	%xmm0,80(%esp)
-	movl	116(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_xts_decrypt
-.private_extern	_aes_hw_xts_decrypt
-.align	4
-_aes_hw_xts_decrypt:
-L_aes_hw_xts_decrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	36(%esp),%edx
-	movl	40(%esp),%esi
-	movl	240(%edx),%ecx
-	movups	(%esi),%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L061enc1_loop_11:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L061enc1_loop_11
-.byte	102,15,56,221,209
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	movl	%esp,%ebp
-	subl	$120,%esp
-	andl	$-16,%esp
-	xorl	%ebx,%ebx
-	testl	$15,%eax
-	setnz	%bl
-	shll	$4,%ebx
-	subl	%ebx,%eax
-	movl	$135,96(%esp)
-	movl	$0,100(%esp)
-	movl	$1,104(%esp)
-	movl	$0,108(%esp)
-	movl	%eax,112(%esp)
-	movl	%ebp,116(%esp)
-	movl	240(%edx),%ecx
-	movl	%edx,%ebp
-	movl	%ecx,%ebx
-	movdqa	%xmm2,%xmm1
-	pxor	%xmm0,%xmm0
-	movdqa	96(%esp),%xmm3
-	pcmpgtd	%xmm1,%xmm0
-	andl	$-16,%eax
-	subl	$96,%eax
-	jc	L062xts_dec_short
-	shll	$4,%ecx
-	movl	$16,%ebx
-	subl	%ecx,%ebx
-	leal	32(%edx,%ecx,1),%edx
-	jmp	L063xts_dec_loop6
-.align	4,0x90
-L063xts_dec_loop6:
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,16(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,32(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,48(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	pshufd	$19,%xmm0,%xmm7
-	movdqa	%xmm1,64(%esp)
-	paddq	%xmm1,%xmm1
-	movups	(%ebp),%xmm0
-	pand	%xmm3,%xmm7
-	movups	(%esi),%xmm2
-	pxor	%xmm1,%xmm7
-	movl	%ebx,%ecx
-	movdqu	16(%esi),%xmm3
-	xorps	%xmm0,%xmm2
-	movdqu	32(%esi),%xmm4
-	pxor	%xmm0,%xmm3
-	movdqu	48(%esi),%xmm5
-	pxor	%xmm0,%xmm4
-	movdqu	64(%esi),%xmm6
-	pxor	%xmm0,%xmm5
-	movdqu	80(%esi),%xmm1
-	pxor	%xmm0,%xmm6
-	leal	96(%esi),%esi
-	pxor	(%esp),%xmm2
-	movdqa	%xmm7,80(%esp)
-	pxor	%xmm1,%xmm7
-	movups	16(%ebp),%xmm1
-	pxor	16(%esp),%xmm3
-	pxor	32(%esp),%xmm4
-.byte	102,15,56,222,209
-	pxor	48(%esp),%xmm5
-	pxor	64(%esp),%xmm6
-.byte	102,15,56,222,217
-	pxor	%xmm0,%xmm7
-	movups	32(%ebp),%xmm0
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-	call	L_aesni_decrypt6_enter
-	movdqa	80(%esp),%xmm1
-	pxor	%xmm0,%xmm0
-	xorps	(%esp),%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	xorps	16(%esp),%xmm3
-	movups	%xmm2,(%edi)
-	xorps	32(%esp),%xmm4
-	movups	%xmm3,16(%edi)
-	xorps	48(%esp),%xmm5
-	movups	%xmm4,32(%edi)
-	xorps	64(%esp),%xmm6
-	movups	%xmm5,48(%edi)
-	xorps	%xmm1,%xmm7
-	movups	%xmm6,64(%edi)
-	pshufd	$19,%xmm0,%xmm2
-	movups	%xmm7,80(%edi)
-	leal	96(%edi),%edi
-	movdqa	96(%esp),%xmm3
-	pxor	%xmm0,%xmm0
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	subl	$96,%eax
-	jnc	L063xts_dec_loop6
-	movl	240(%ebp),%ecx
-	movl	%ebp,%edx
-	movl	%ecx,%ebx
-L062xts_dec_short:
-	addl	$96,%eax
-	jz	L064xts_dec_done6x
-	movdqa	%xmm1,%xmm5
-	cmpl	$32,%eax
-	jb	L065xts_dec_one
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	je	L066xts_dec_two
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,%xmm6
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	cmpl	$64,%eax
-	jb	L067xts_dec_three
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	%xmm1,%xmm7
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm5,(%esp)
-	movdqa	%xmm6,16(%esp)
-	je	L068xts_dec_four
-	movdqa	%xmm7,32(%esp)
-	pshufd	$19,%xmm0,%xmm7
-	movdqa	%xmm1,48(%esp)
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm7
-	pxor	%xmm1,%xmm7
-	movdqu	(%esi),%xmm2
-	movdqu	16(%esi),%xmm3
-	movdqu	32(%esi),%xmm4
-	pxor	(%esp),%xmm2
-	movdqu	48(%esi),%xmm5
-	pxor	16(%esp),%xmm3
-	movdqu	64(%esi),%xmm6
-	pxor	32(%esp),%xmm4
-	leal	80(%esi),%esi
-	pxor	48(%esp),%xmm5
-	movdqa	%xmm7,64(%esp)
-	pxor	%xmm7,%xmm6
-	call	__aesni_decrypt6
-	movaps	64(%esp),%xmm1
-	xorps	(%esp),%xmm2
-	xorps	16(%esp),%xmm3
-	xorps	32(%esp),%xmm4
-	movups	%xmm2,(%edi)
-	xorps	48(%esp),%xmm5
-	movups	%xmm3,16(%edi)
-	xorps	%xmm1,%xmm6
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	movups	%xmm6,64(%edi)
-	leal	80(%edi),%edi
-	jmp	L069xts_dec_done
-.align	4,0x90
-L065xts_dec_one:
-	movups	(%esi),%xmm2
-	leal	16(%esi),%esi
-	xorps	%xmm5,%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L070dec1_loop_12:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L070dec1_loop_12
-.byte	102,15,56,223,209
-	xorps	%xmm5,%xmm2
-	movups	%xmm2,(%edi)
-	leal	16(%edi),%edi
-	movdqa	%xmm5,%xmm1
-	jmp	L069xts_dec_done
-.align	4,0x90
-L066xts_dec_two:
-	movaps	%xmm1,%xmm6
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	leal	32(%esi),%esi
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	call	__aesni_decrypt2
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	leal	32(%edi),%edi
-	movdqa	%xmm6,%xmm1
-	jmp	L069xts_dec_done
-.align	4,0x90
-L067xts_dec_three:
-	movaps	%xmm1,%xmm7
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	movups	32(%esi),%xmm4
-	leal	48(%esi),%esi
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm4
-	call	__aesni_decrypt3
-	xorps	%xmm5,%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm7,%xmm4
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	leal	48(%edi),%edi
-	movdqa	%xmm7,%xmm1
-	jmp	L069xts_dec_done
-.align	4,0x90
-L068xts_dec_four:
-	movaps	%xmm1,%xmm6
-	movups	(%esi),%xmm2
-	movups	16(%esi),%xmm3
-	movups	32(%esi),%xmm4
-	xorps	(%esp),%xmm2
-	movups	48(%esi),%xmm5
-	leal	64(%esi),%esi
-	xorps	16(%esp),%xmm3
-	xorps	%xmm7,%xmm4
-	xorps	%xmm6,%xmm5
-	call	__aesni_decrypt4
-	xorps	(%esp),%xmm2
-	xorps	16(%esp),%xmm3
-	xorps	%xmm7,%xmm4
-	movups	%xmm2,(%edi)
-	xorps	%xmm6,%xmm5
-	movups	%xmm3,16(%edi)
-	movups	%xmm4,32(%edi)
-	movups	%xmm5,48(%edi)
-	leal	64(%edi),%edi
-	movdqa	%xmm6,%xmm1
-	jmp	L069xts_dec_done
-.align	4,0x90
-L064xts_dec_done6x:
-	movl	112(%esp),%eax
-	andl	$15,%eax
-	jz	L071xts_dec_ret
-	movl	%eax,112(%esp)
-	jmp	L072xts_dec_only_one_more
-.align	4,0x90
-L069xts_dec_done:
-	movl	112(%esp),%eax
-	pxor	%xmm0,%xmm0
-	andl	$15,%eax
-	jz	L071xts_dec_ret
-	pcmpgtd	%xmm1,%xmm0
-	movl	%eax,112(%esp)
-	pshufd	$19,%xmm0,%xmm2
-	pxor	%xmm0,%xmm0
-	movdqa	96(%esp),%xmm3
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm2
-	pcmpgtd	%xmm1,%xmm0
-	pxor	%xmm2,%xmm1
-L072xts_dec_only_one_more:
-	pshufd	$19,%xmm0,%xmm5
-	movdqa	%xmm1,%xmm6
-	paddq	%xmm1,%xmm1
-	pand	%xmm3,%xmm5
-	pxor	%xmm1,%xmm5
-	movl	%ebp,%edx
-	movl	%ebx,%ecx
-	movups	(%esi),%xmm2
-	xorps	%xmm5,%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L073dec1_loop_13:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L073dec1_loop_13
-.byte	102,15,56,223,209
-	xorps	%xmm5,%xmm2
-	movups	%xmm2,(%edi)
-L074xts_dec_steal:
-	movzbl	16(%esi),%ecx
-	movzbl	(%edi),%edx
-	leal	1(%esi),%esi
-	movb	%cl,(%edi)
-	movb	%dl,16(%edi)
-	leal	1(%edi),%edi
-	subl	$1,%eax
-	jnz	L074xts_dec_steal
-	subl	112(%esp),%edi
-	movl	%ebp,%edx
-	movl	%ebx,%ecx
-	movups	(%edi),%xmm2
-	xorps	%xmm6,%xmm2
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L075dec1_loop_14:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L075dec1_loop_14
-.byte	102,15,56,223,209
-	xorps	%xmm6,%xmm2
-	movups	%xmm2,(%edi)
-L071xts_dec_ret:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	movdqa	%xmm0,(%esp)
-	pxor	%xmm3,%xmm3
-	movdqa	%xmm0,16(%esp)
-	pxor	%xmm4,%xmm4
-	movdqa	%xmm0,32(%esp)
-	pxor	%xmm5,%xmm5
-	movdqa	%xmm0,48(%esp)
-	pxor	%xmm6,%xmm6
-	movdqa	%xmm0,64(%esp)
-	pxor	%xmm7,%xmm7
-	movdqa	%xmm0,80(%esp)
-	movl	116(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_cbc_encrypt
-.private_extern	_aes_hw_cbc_encrypt
-.align	4
-_aes_hw_cbc_encrypt:
-L_aes_hw_cbc_encrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	%esp,%ebx
-	movl	24(%esp),%edi
-	subl	$24,%ebx
-	movl	28(%esp),%eax
-	andl	$-16,%ebx
-	movl	32(%esp),%edx
-	movl	36(%esp),%ebp
-	testl	%eax,%eax
-	jz	L076cbc_abort
-	cmpl	$0,40(%esp)
-	xchgl	%esp,%ebx
-	movups	(%ebp),%xmm7
-	movl	240(%edx),%ecx
-	movl	%edx,%ebp
-	movl	%ebx,16(%esp)
-	movl	%ecx,%ebx
-	je	L077cbc_decrypt
-	movaps	%xmm7,%xmm2
-	cmpl	$16,%eax
-	jb	L078cbc_enc_tail
-	subl	$16,%eax
-	jmp	L079cbc_enc_loop
-.align	4,0x90
-L079cbc_enc_loop:
-	movups	(%esi),%xmm7
-	leal	16(%esi),%esi
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	xorps	%xmm0,%xmm7
-	leal	32(%edx),%edx
-	xorps	%xmm7,%xmm2
-L080enc1_loop_15:
-.byte	102,15,56,220,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L080enc1_loop_15
-.byte	102,15,56,221,209
-	movl	%ebx,%ecx
-	movl	%ebp,%edx
-	movups	%xmm2,(%edi)
-	leal	16(%edi),%edi
-	subl	$16,%eax
-	jnc	L079cbc_enc_loop
-	addl	$16,%eax
-	jnz	L078cbc_enc_tail
-	movaps	%xmm2,%xmm7
-	pxor	%xmm2,%xmm2
-	jmp	L081cbc_ret
-L078cbc_enc_tail:
-	movl	%eax,%ecx
-.long	2767451785
-	movl	$16,%ecx
-	subl	%eax,%ecx
-	xorl	%eax,%eax
-.long	2868115081
-	leal	-16(%edi),%edi
-	movl	%ebx,%ecx
-	movl	%edi,%esi
-	movl	%ebp,%edx
-	jmp	L079cbc_enc_loop
-.align	4,0x90
-L077cbc_decrypt:
-	cmpl	$80,%eax
-	jbe	L082cbc_dec_tail
-	movaps	%xmm7,(%esp)
-	subl	$80,%eax
-	jmp	L083cbc_dec_loop6_enter
-.align	4,0x90
-L084cbc_dec_loop6:
-	movaps	%xmm0,(%esp)
-	movups	%xmm7,(%edi)
-	leal	16(%edi),%edi
-L083cbc_dec_loop6_enter:
-	movdqu	(%esi),%xmm2
-	movdqu	16(%esi),%xmm3
-	movdqu	32(%esi),%xmm4
-	movdqu	48(%esi),%xmm5
-	movdqu	64(%esi),%xmm6
-	movdqu	80(%esi),%xmm7
-	call	__aesni_decrypt6
-	movups	(%esi),%xmm1
-	movups	16(%esi),%xmm0
-	xorps	(%esp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%esi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%esi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%esi),%xmm1
-	xorps	%xmm0,%xmm6
-	movups	80(%esi),%xmm0
-	xorps	%xmm1,%xmm7
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	leal	96(%esi),%esi
-	movups	%xmm4,32(%edi)
-	movl	%ebx,%ecx
-	movups	%xmm5,48(%edi)
-	movl	%ebp,%edx
-	movups	%xmm6,64(%edi)
-	leal	80(%edi),%edi
-	subl	$96,%eax
-	ja	L084cbc_dec_loop6
-	movaps	%xmm7,%xmm2
-	movaps	%xmm0,%xmm7
-	addl	$80,%eax
-	jle	L085cbc_dec_clear_tail_collected
-	movups	%xmm2,(%edi)
-	leal	16(%edi),%edi
-L082cbc_dec_tail:
-	movups	(%esi),%xmm2
-	movaps	%xmm2,%xmm6
-	cmpl	$16,%eax
-	jbe	L086cbc_dec_one
-	movups	16(%esi),%xmm3
-	movaps	%xmm3,%xmm5
-	cmpl	$32,%eax
-	jbe	L087cbc_dec_two
-	movups	32(%esi),%xmm4
-	cmpl	$48,%eax
-	jbe	L088cbc_dec_three
-	movups	48(%esi),%xmm5
-	cmpl	$64,%eax
-	jbe	L089cbc_dec_four
-	movups	64(%esi),%xmm6
-	movaps	%xmm7,(%esp)
-	movups	(%esi),%xmm2
-	xorps	%xmm7,%xmm7
-	call	__aesni_decrypt6
-	movups	(%esi),%xmm1
-	movups	16(%esi),%xmm0
-	xorps	(%esp),%xmm2
-	xorps	%xmm1,%xmm3
-	movups	32(%esi),%xmm1
-	xorps	%xmm0,%xmm4
-	movups	48(%esi),%xmm0
-	xorps	%xmm1,%xmm5
-	movups	64(%esi),%xmm7
-	xorps	%xmm0,%xmm6
-	movups	%xmm2,(%edi)
-	movups	%xmm3,16(%edi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%edi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%edi)
-	pxor	%xmm5,%xmm5
-	leal	64(%edi),%edi
-	movaps	%xmm6,%xmm2
-	pxor	%xmm6,%xmm6
-	subl	$80,%eax
-	jmp	L090cbc_dec_tail_collected
-.align	4,0x90
-L086cbc_dec_one:
-	movups	(%edx),%xmm0
-	movups	16(%edx),%xmm1
-	leal	32(%edx),%edx
-	xorps	%xmm0,%xmm2
-L091dec1_loop_16:
-.byte	102,15,56,222,209
-	decl	%ecx
-	movups	(%edx),%xmm1
-	leal	16(%edx),%edx
-	jnz	L091dec1_loop_16
-.byte	102,15,56,223,209
-	xorps	%xmm7,%xmm2
-	movaps	%xmm6,%xmm7
-	subl	$16,%eax
-	jmp	L090cbc_dec_tail_collected
-.align	4,0x90
-L087cbc_dec_two:
-	call	__aesni_decrypt2
-	xorps	%xmm7,%xmm2
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	movaps	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	leal	16(%edi),%edi
-	movaps	%xmm5,%xmm7
-	subl	$32,%eax
-	jmp	L090cbc_dec_tail_collected
-.align	4,0x90
-L088cbc_dec_three:
-	call	__aesni_decrypt3
-	xorps	%xmm7,%xmm2
-	xorps	%xmm6,%xmm3
-	xorps	%xmm5,%xmm4
-	movups	%xmm2,(%edi)
-	movaps	%xmm4,%xmm2
-	pxor	%xmm4,%xmm4
-	movups	%xmm3,16(%edi)
-	pxor	%xmm3,%xmm3
-	leal	32(%edi),%edi
-	movups	32(%esi),%xmm7
-	subl	$48,%eax
-	jmp	L090cbc_dec_tail_collected
-.align	4,0x90
-L089cbc_dec_four:
-	call	__aesni_decrypt4
-	movups	16(%esi),%xmm1
-	movups	32(%esi),%xmm0
-	xorps	%xmm7,%xmm2
-	movups	48(%esi),%xmm7
-	xorps	%xmm6,%xmm3
-	movups	%xmm2,(%edi)
-	xorps	%xmm1,%xmm4
-	movups	%xmm3,16(%edi)
-	pxor	%xmm3,%xmm3
-	xorps	%xmm0,%xmm5
-	movups	%xmm4,32(%edi)
-	pxor	%xmm4,%xmm4
-	leal	48(%edi),%edi
-	movaps	%xmm5,%xmm2
-	pxor	%xmm5,%xmm5
-	subl	$64,%eax
-	jmp	L090cbc_dec_tail_collected
-.align	4,0x90
-L085cbc_dec_clear_tail_collected:
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-L090cbc_dec_tail_collected:
-	andl	$15,%eax
-	jnz	L092cbc_dec_tail_partial
-	movups	%xmm2,(%edi)
-	pxor	%xmm0,%xmm0
-	jmp	L081cbc_ret
-.align	4,0x90
-L092cbc_dec_tail_partial:
-	movaps	%xmm2,(%esp)
-	pxor	%xmm0,%xmm0
-	movl	$16,%ecx
-	movl	%esp,%esi
-	subl	%eax,%ecx
-.long	2767451785
-	movdqa	%xmm2,(%esp)
-L081cbc_ret:
-	movl	16(%esp),%esp
-	movl	36(%esp),%ebp
-	pxor	%xmm2,%xmm2
-	pxor	%xmm1,%xmm1
-	movups	%xmm7,(%ebp)
-	pxor	%xmm7,%xmm7
-L076cbc_abort:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.private_extern	__aesni_set_encrypt_key
-.align	4
-__aesni_set_encrypt_key:
-	pushl	%ebp
-	pushl	%ebx
-	testl	%eax,%eax
-	jz	L093bad_pointer
-	testl	%edx,%edx
-	jz	L093bad_pointer
-	call	L094pic
-L094pic:
-	popl	%ebx
-	leal	Lkey_const-L094pic(%ebx),%ebx
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
-	movups	(%eax),%xmm0
-	xorps	%xmm4,%xmm4
-	movl	4(%ebp),%ebp
-	leal	16(%edx),%edx
-	andl	$268437504,%ebp
-	cmpl	$256,%ecx
-	je	L09514rounds
-	cmpl	$192,%ecx
-	je	L09612rounds
-	cmpl	$128,%ecx
-	jne	L097bad_keybits
-.align	4,0x90
-L09810rounds:
-	cmpl	$268435456,%ebp
-	je	L09910rounds_alt
-	movl	$9,%ecx
-	movups	%xmm0,-16(%edx)
-.byte	102,15,58,223,200,1
-	call	L100key_128_cold
-.byte	102,15,58,223,200,2
-	call	L101key_128
-.byte	102,15,58,223,200,4
-	call	L101key_128
-.byte	102,15,58,223,200,8
-	call	L101key_128
-.byte	102,15,58,223,200,16
-	call	L101key_128
-.byte	102,15,58,223,200,32
-	call	L101key_128
-.byte	102,15,58,223,200,64
-	call	L101key_128
-.byte	102,15,58,223,200,128
-	call	L101key_128
-.byte	102,15,58,223,200,27
-	call	L101key_128
-.byte	102,15,58,223,200,54
-	call	L101key_128
-	movups	%xmm0,(%edx)
-	movl	%ecx,80(%edx)
-	jmp	L102good_key
-.align	4,0x90
-L101key_128:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-L100key_128_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-.align	4,0x90
-L09910rounds_alt:
-	movdqa	(%ebx),%xmm5
-	movl	$8,%ecx
-	movdqa	32(%ebx),%xmm4
-	movdqa	%xmm0,%xmm2
-	movdqu	%xmm0,-16(%edx)
-L103loop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-	leal	16(%edx),%edx
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,-16(%edx)
-	movdqa	%xmm0,%xmm2
-	decl	%ecx
-	jnz	L103loop_key128
-	movdqa	48(%ebx),%xmm4
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%edx)
-	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,16(%edx)
-	movl	$9,%ecx
-	movl	%ecx,96(%edx)
-	jmp	L102good_key
-.align	4,0x90
-L09612rounds:
-	movq	16(%eax),%xmm2
-	cmpl	$268435456,%ebp
-	je	L10412rounds_alt
-	movl	$11,%ecx
-	movups	%xmm0,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	L105key_192a_cold
-.byte	102,15,58,223,202,2
-	call	L106key_192b
-.byte	102,15,58,223,202,4
-	call	L107key_192a
-.byte	102,15,58,223,202,8
-	call	L106key_192b
-.byte	102,15,58,223,202,16
-	call	L107key_192a
-.byte	102,15,58,223,202,32
-	call	L106key_192b
-.byte	102,15,58,223,202,64
-	call	L107key_192a
-.byte	102,15,58,223,202,128
-	call	L106key_192b
-	movups	%xmm0,(%edx)
-	movl	%ecx,48(%edx)
-	jmp	L102good_key
-.align	4,0x90
-L107key_192a:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-.align	4,0x90
-L105key_192a_cold:
-	movaps	%xmm2,%xmm5
-L108key_192b_warm:
-	shufps	$16,%xmm0,%xmm4
-	movdqa	%xmm2,%xmm3
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	pslldq	$4,%xmm3
-	xorps	%xmm4,%xmm0
-	pshufd	$85,%xmm1,%xmm1
-	pxor	%xmm3,%xmm2
-	pxor	%xmm1,%xmm0
-	pshufd	$255,%xmm0,%xmm3
-	pxor	%xmm3,%xmm2
-	ret
-.align	4,0x90
-L106key_192b:
-	movaps	%xmm0,%xmm3
-	shufps	$68,%xmm0,%xmm5
-	movups	%xmm5,(%edx)
-	shufps	$78,%xmm2,%xmm3
-	movups	%xmm3,16(%edx)
-	leal	32(%edx),%edx
-	jmp	L108key_192b_warm
-.align	4,0x90
-L10412rounds_alt:
-	movdqa	16(%ebx),%xmm5
-	movdqa	32(%ebx),%xmm4
-	movl	$8,%ecx
-	movdqu	%xmm0,-16(%edx)
-L109loop_key192:
-	movq	%xmm2,(%edx)
-	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-	pslld	$1,%xmm4
-	leal	24(%edx),%edx
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-	pshufd	$255,%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pxor	%xmm2,%xmm0
-	pxor	%xmm3,%xmm2
-	movdqu	%xmm0,-16(%edx)
-	decl	%ecx
-	jnz	L109loop_key192
-	movl	$11,%ecx
-	movl	%ecx,32(%edx)
-	jmp	L102good_key
-.align	4,0x90
-L09514rounds:
-	movups	16(%eax),%xmm2
-	leal	16(%edx),%edx
-	cmpl	$268435456,%ebp
-	je	L11014rounds_alt
-	movl	$13,%ecx
-	movups	%xmm0,-32(%edx)
-	movups	%xmm2,-16(%edx)
-.byte	102,15,58,223,202,1
-	call	L111key_256a_cold
-.byte	102,15,58,223,200,1
-	call	L112key_256b
-.byte	102,15,58,223,202,2
-	call	L113key_256a
-.byte	102,15,58,223,200,2
-	call	L112key_256b
-.byte	102,15,58,223,202,4
-	call	L113key_256a
-.byte	102,15,58,223,200,4
-	call	L112key_256b
-.byte	102,15,58,223,202,8
-	call	L113key_256a
-.byte	102,15,58,223,200,8
-	call	L112key_256b
-.byte	102,15,58,223,202,16
-	call	L113key_256a
-.byte	102,15,58,223,200,16
-	call	L112key_256b
-.byte	102,15,58,223,202,32
-	call	L113key_256a
-.byte	102,15,58,223,200,32
-	call	L112key_256b
-.byte	102,15,58,223,202,64
-	call	L113key_256a
-	movups	%xmm0,(%edx)
-	movl	%ecx,16(%edx)
-	xorl	%eax,%eax
-	jmp	L102good_key
-.align	4,0x90
-L113key_256a:
-	movups	%xmm2,(%edx)
-	leal	16(%edx),%edx
-L111key_256a_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-.align	4,0x90
-L112key_256b:
-	movups	%xmm0,(%edx)
-	leal	16(%edx),%edx
-	shufps	$16,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$140,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$170,%xmm1,%xmm1
-	xorps	%xmm1,%xmm2
-	ret
-.align	4,0x90
-L11014rounds_alt:
-	movdqa	(%ebx),%xmm5
-	movdqa	32(%ebx),%xmm4
-	movl	$7,%ecx
-	movdqu	%xmm0,-32(%edx)
-	movdqa	%xmm2,%xmm1
-	movdqu	%xmm2,-16(%edx)
-L114loop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-	pslld	$1,%xmm4
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%edx)
-	decl	%ecx
-	jz	L115done_key256
-	pshufd	$255,%xmm0,%xmm2
-	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
-	movdqa	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm3,%xmm1
-	pxor	%xmm1,%xmm2
-	movdqu	%xmm2,16(%edx)
-	leal	32(%edx),%edx
-	movdqa	%xmm2,%xmm1
-	jmp	L114loop_key256
-L115done_key256:
-	movl	$13,%ecx
-	movl	%ecx,16(%edx)
-L102good_key:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	xorl	%eax,%eax
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	2,0x90
-L093bad_pointer:
-	movl	$-1,%eax
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	2,0x90
-L097bad_keybits:
-	pxor	%xmm0,%xmm0
-	movl	$-2,%eax
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_aes_hw_set_encrypt_key
-.private_extern	_aes_hw_set_encrypt_key
-.align	4
-_aes_hw_set_encrypt_key:
-L_aes_hw_set_encrypt_key_begin:
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L116pic
-L116pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+3-L116pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	__aesni_set_encrypt_key
-	ret
-.globl	_aes_hw_set_decrypt_key
-.private_extern	_aes_hw_set_decrypt_key
-.align	4
-_aes_hw_set_decrypt_key:
-L_aes_hw_set_decrypt_key_begin:
-	movl	4(%esp),%eax
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	call	__aesni_set_encrypt_key
-	movl	12(%esp),%edx
-	shll	$4,%ecx
-	testl	%eax,%eax
-	jnz	L117dec_key_ret
-	leal	16(%edx,%ecx,1),%eax
-	movups	(%edx),%xmm0
-	movups	(%eax),%xmm1
-	movups	%xmm0,(%eax)
-	movups	%xmm1,(%edx)
-	leal	16(%edx),%edx
-	leal	-16(%eax),%eax
-L118dec_key_inverse:
-	movups	(%edx),%xmm0
-	movups	(%eax),%xmm1
-.byte	102,15,56,219,192
-.byte	102,15,56,219,201
-	leal	16(%edx),%edx
-	leal	-16(%eax),%eax
-	movups	%xmm0,16(%eax)
-	movups	%xmm1,-16(%edx)
-	cmpl	%edx,%eax
-	ja	L118dec_key_inverse
-	movups	(%edx),%xmm0
-.byte	102,15,56,219,192
-	movups	%xmm0,(%edx)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	xorl	%eax,%eax
-L117dec_key_ret:
-	ret
-.align	6,0x90
-Lkey_const:
-.long	202313229,202313229,202313229,202313229
-.long	67569157,67569157,67569157,67569157
-.long	1,1,1,1
-.long	27,27,27,27
-.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
-.byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
-.byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
-.byte	115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/bn-586-apple.S b/apple-x86/crypto/fipsmodule/bn-586-apple.S
deleted file mode 100644
index 93513d0..0000000
--- a/apple-x86/crypto/fipsmodule/bn-586-apple.S
+++ /dev/null
@@ -1,987 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_bn_mul_add_words
-.private_extern	_bn_mul_add_words
-.align	4
-_bn_mul_add_words:
-L_bn_mul_add_words_begin:
-	call	L000PIC_me_up
-L000PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L001maw_non_sse2
-	movl	4(%esp),%eax
-	movl	8(%esp),%edx
-	movl	12(%esp),%ecx
-	movd	16(%esp),%mm0
-	pxor	%mm1,%mm1
-	jmp	L002maw_sse2_entry
-.align	4,0x90
-L003maw_sse2_unrolled:
-	movd	(%eax),%mm3
-	paddq	%mm3,%mm1
-	movd	(%edx),%mm2
-	pmuludq	%mm0,%mm2
-	movd	4(%edx),%mm4
-	pmuludq	%mm0,%mm4
-	movd	8(%edx),%mm6
-	pmuludq	%mm0,%mm6
-	movd	12(%edx),%mm7
-	pmuludq	%mm0,%mm7
-	paddq	%mm2,%mm1
-	movd	4(%eax),%mm3
-	paddq	%mm4,%mm3
-	movd	8(%eax),%mm5
-	paddq	%mm6,%mm5
-	movd	12(%eax),%mm4
-	paddq	%mm4,%mm7
-	movd	%mm1,(%eax)
-	movd	16(%edx),%mm2
-	pmuludq	%mm0,%mm2
-	psrlq	$32,%mm1
-	movd	20(%edx),%mm4
-	pmuludq	%mm0,%mm4
-	paddq	%mm3,%mm1
-	movd	24(%edx),%mm6
-	pmuludq	%mm0,%mm6
-	movd	%mm1,4(%eax)
-	psrlq	$32,%mm1
-	movd	28(%edx),%mm3
-	addl	$32,%edx
-	pmuludq	%mm0,%mm3
-	paddq	%mm5,%mm1
-	movd	16(%eax),%mm5
-	paddq	%mm5,%mm2
-	movd	%mm1,8(%eax)
-	psrlq	$32,%mm1
-	paddq	%mm7,%mm1
-	movd	20(%eax),%mm5
-	paddq	%mm5,%mm4
-	movd	%mm1,12(%eax)
-	psrlq	$32,%mm1
-	paddq	%mm2,%mm1
-	movd	24(%eax),%mm5
-	paddq	%mm5,%mm6
-	movd	%mm1,16(%eax)
-	psrlq	$32,%mm1
-	paddq	%mm4,%mm1
-	movd	28(%eax),%mm5
-	paddq	%mm5,%mm3
-	movd	%mm1,20(%eax)
-	psrlq	$32,%mm1
-	paddq	%mm6,%mm1
-	movd	%mm1,24(%eax)
-	psrlq	$32,%mm1
-	paddq	%mm3,%mm1
-	movd	%mm1,28(%eax)
-	leal	32(%eax),%eax
-	psrlq	$32,%mm1
-	subl	$8,%ecx
-	jz	L004maw_sse2_exit
-L002maw_sse2_entry:
-	testl	$4294967288,%ecx
-	jnz	L003maw_sse2_unrolled
-.align	2,0x90
-L005maw_sse2_loop:
-	movd	(%edx),%mm2
-	movd	(%eax),%mm3
-	pmuludq	%mm0,%mm2
-	leal	4(%edx),%edx
-	paddq	%mm3,%mm1
-	paddq	%mm2,%mm1
-	movd	%mm1,(%eax)
-	subl	$1,%ecx
-	psrlq	$32,%mm1
-	leal	4(%eax),%eax
-	jnz	L005maw_sse2_loop
-L004maw_sse2_exit:
-	movd	%mm1,%eax
-	emms
-	ret
-.align	4,0x90
-L001maw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	28(%esp),%ecx
-	movl	24(%esp),%ebx
-	andl	$4294967288,%ecx
-	movl	32(%esp),%ebp
-	pushl	%ecx
-	jz	L006maw_finish
-.align	4,0x90
-L007maw_loop:
-	# Round 0 
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	# Round 4 
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	# Round 8 
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	# Round 12 
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	# Round 16 
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	# Round 20 
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	# Round 24 
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-	# Round 28 
-	movl	28(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	28(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	subl	$8,%ecx
-	leal	32(%ebx),%ebx
-	leal	32(%edi),%edi
-	jnz	L007maw_loop
-L006maw_finish:
-	movl	32(%esp),%ecx
-	andl	$7,%ecx
-	jnz	L008maw_finish2
-	jmp	L009maw_end
-L008maw_finish2:
-	# Tail Round 0 
-	movl	(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 1 
-	movl	4(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	4(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 2 
-	movl	8(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	8(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 3 
-	movl	12(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	12(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 4 
-	movl	16(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	16(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 5 
-	movl	20(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	20(%edi),%eax
-	adcl	$0,%edx
-	decl	%ecx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	jz	L009maw_end
-	# Tail Round 6 
-	movl	24(%ebx),%eax
-	mull	%ebp
-	addl	%esi,%eax
-	adcl	$0,%edx
-	addl	24(%edi),%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-L009maw_end:
-	movl	%esi,%eax
-	popl	%ecx
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_bn_mul_words
-.private_extern	_bn_mul_words
-.align	4
-_bn_mul_words:
-L_bn_mul_words_begin:
-	call	L010PIC_me_up
-L010PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L011mw_non_sse2
-	movl	4(%esp),%eax
-	movl	8(%esp),%edx
-	movl	12(%esp),%ecx
-	movd	16(%esp),%mm0
-	pxor	%mm1,%mm1
-.align	4,0x90
-L012mw_sse2_loop:
-	movd	(%edx),%mm2
-	pmuludq	%mm0,%mm2
-	leal	4(%edx),%edx
-	paddq	%mm2,%mm1
-	movd	%mm1,(%eax)
-	subl	$1,%ecx
-	psrlq	$32,%mm1
-	leal	4(%eax),%eax
-	jnz	L012mw_sse2_loop
-	movd	%mm1,%eax
-	emms
-	ret
-.align	4,0x90
-L011mw_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	xorl	%esi,%esi
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebx
-	movl	28(%esp),%ebp
-	movl	32(%esp),%ecx
-	andl	$4294967288,%ebp
-	jz	L013mw_finish
-L014mw_loop:
-	# Round 0 
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	# Round 4 
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	# Round 8 
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	# Round 12 
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	# Round 16 
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	# Round 20 
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	# Round 24 
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-	# Round 28 
-	movl	28(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,28(%edi)
-	movl	%edx,%esi
-
-	addl	$32,%ebx
-	addl	$32,%edi
-	subl	$8,%ebp
-	jz	L013mw_finish
-	jmp	L014mw_loop
-L013mw_finish:
-	movl	28(%esp),%ebp
-	andl	$7,%ebp
-	jnz	L015mw_finish2
-	jmp	L016mw_end
-L015mw_finish2:
-	# Tail Round 0 
-	movl	(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 1 
-	movl	4(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,4(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 2 
-	movl	8(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,8(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 3 
-	movl	12(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,12(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 4 
-	movl	16(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,16(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 5 
-	movl	20(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,20(%edi)
-	movl	%edx,%esi
-	decl	%ebp
-	jz	L016mw_end
-	# Tail Round 6 
-	movl	24(%ebx),%eax
-	mull	%ecx
-	addl	%esi,%eax
-	adcl	$0,%edx
-	movl	%eax,24(%edi)
-	movl	%edx,%esi
-L016mw_end:
-	movl	%esi,%eax
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_bn_sqr_words
-.private_extern	_bn_sqr_words
-.align	4
-_bn_sqr_words:
-L_bn_sqr_words_begin:
-	call	L017PIC_me_up
-L017PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L018sqr_non_sse2
-	movl	4(%esp),%eax
-	movl	8(%esp),%edx
-	movl	12(%esp),%ecx
-.align	4,0x90
-L019sqr_sse2_loop:
-	movd	(%edx),%mm0
-	pmuludq	%mm0,%mm0
-	leal	4(%edx),%edx
-	movq	%mm0,(%eax)
-	subl	$1,%ecx
-	leal	8(%eax),%eax
-	jnz	L019sqr_sse2_loop
-	emms
-	ret
-.align	4,0x90
-L018sqr_non_sse2:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%ebx
-	andl	$4294967288,%ebx
-	jz	L020sw_finish
-L021sw_loop:
-	# Round 0 
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	movl	%edx,4(%esi)
-	# Round 4 
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	movl	%edx,12(%esi)
-	# Round 8 
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	movl	%edx,20(%esi)
-	# Round 12 
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	movl	%edx,28(%esi)
-	# Round 16 
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	movl	%edx,36(%esi)
-	# Round 20 
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	movl	%edx,44(%esi)
-	# Round 24 
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-	# Round 28 
-	movl	28(%edi),%eax
-	mull	%eax
-	movl	%eax,56(%esi)
-	movl	%edx,60(%esi)
-
-	addl	$32,%edi
-	addl	$64,%esi
-	subl	$8,%ebx
-	jnz	L021sw_loop
-L020sw_finish:
-	movl	28(%esp),%ebx
-	andl	$7,%ebx
-	jz	L022sw_end
-	# Tail Round 0 
-	movl	(%edi),%eax
-	mull	%eax
-	movl	%eax,(%esi)
-	decl	%ebx
-	movl	%edx,4(%esi)
-	jz	L022sw_end
-	# Tail Round 1 
-	movl	4(%edi),%eax
-	mull	%eax
-	movl	%eax,8(%esi)
-	decl	%ebx
-	movl	%edx,12(%esi)
-	jz	L022sw_end
-	# Tail Round 2 
-	movl	8(%edi),%eax
-	mull	%eax
-	movl	%eax,16(%esi)
-	decl	%ebx
-	movl	%edx,20(%esi)
-	jz	L022sw_end
-	# Tail Round 3 
-	movl	12(%edi),%eax
-	mull	%eax
-	movl	%eax,24(%esi)
-	decl	%ebx
-	movl	%edx,28(%esi)
-	jz	L022sw_end
-	# Tail Round 4 
-	movl	16(%edi),%eax
-	mull	%eax
-	movl	%eax,32(%esi)
-	decl	%ebx
-	movl	%edx,36(%esi)
-	jz	L022sw_end
-	# Tail Round 5 
-	movl	20(%edi),%eax
-	mull	%eax
-	movl	%eax,40(%esi)
-	decl	%ebx
-	movl	%edx,44(%esi)
-	jz	L022sw_end
-	# Tail Round 6 
-	movl	24(%edi),%eax
-	mull	%eax
-	movl	%eax,48(%esi)
-	movl	%edx,52(%esi)
-L022sw_end:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_bn_div_words
-.private_extern	_bn_div_words
-.align	4
-_bn_div_words:
-L_bn_div_words_begin:
-	movl	4(%esp),%edx
-	movl	8(%esp),%eax
-	movl	12(%esp),%ecx
-	divl	%ecx
-	ret
-.globl	_bn_add_words
-.private_extern	_bn_add_words
-.align	4
-_bn_add_words:
-L_bn_add_words_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	movl	20(%esp),%ebx
-	movl	24(%esp),%esi
-	movl	28(%esp),%edi
-	movl	32(%esp),%ebp
-	xorl	%eax,%eax
-	andl	$4294967288,%ebp
-	jz	L023aw_finish
-L024aw_loop:
-	# Round 0 
-	movl	(%esi),%ecx
-	movl	(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,(%ebx)
-	# Round 1 
-	movl	4(%esi),%ecx
-	movl	4(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,4(%ebx)
-	# Round 2 
-	movl	8(%esi),%ecx
-	movl	8(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,8(%ebx)
-	# Round 3 
-	movl	12(%esi),%ecx
-	movl	12(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,12(%ebx)
-	# Round 4 
-	movl	16(%esi),%ecx
-	movl	16(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,16(%ebx)
-	# Round 5 
-	movl	20(%esi),%ecx
-	movl	20(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,20(%ebx)
-	# Round 6 
-	movl	24(%esi),%ecx
-	movl	24(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,24(%ebx)
-	# Round 7 
-	movl	28(%esi),%ecx
-	movl	28(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,28(%ebx)
-
-	addl	$32,%esi
-	addl	$32,%edi
-	addl	$32,%ebx
-	subl	$8,%ebp
-	jnz	L024aw_loop
-L023aw_finish:
-	movl	32(%esp),%ebp
-	andl	$7,%ebp
-	jz	L025aw_end
-	# Tail Round 0 
-	movl	(%esi),%ecx
-	movl	(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,(%ebx)
-	jz	L025aw_end
-	# Tail Round 1 
-	movl	4(%esi),%ecx
-	movl	4(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,4(%ebx)
-	jz	L025aw_end
-	# Tail Round 2 
-	movl	8(%esi),%ecx
-	movl	8(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,8(%ebx)
-	jz	L025aw_end
-	# Tail Round 3 
-	movl	12(%esi),%ecx
-	movl	12(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,12(%ebx)
-	jz	L025aw_end
-	# Tail Round 4 
-	movl	16(%esi),%ecx
-	movl	16(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,16(%ebx)
-	jz	L025aw_end
-	# Tail Round 5 
-	movl	20(%esi),%ecx
-	movl	20(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,20(%ebx)
-	jz	L025aw_end
-	# Tail Round 6 
-	movl	24(%esi),%ecx
-	movl	24(%edi),%edx
-	addl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	addl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,24(%ebx)
-L025aw_end:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_bn_sub_words
-.private_extern	_bn_sub_words
-.align	4
-_bn_sub_words:
-L_bn_sub_words_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-
-	movl	20(%esp),%ebx
-	movl	24(%esp),%esi
-	movl	28(%esp),%edi
-	movl	32(%esp),%ebp
-	xorl	%eax,%eax
-	andl	$4294967288,%ebp
-	jz	L026aw_finish
-L027aw_loop:
-	# Round 0 
-	movl	(%esi),%ecx
-	movl	(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,(%ebx)
-	# Round 1 
-	movl	4(%esi),%ecx
-	movl	4(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,4(%ebx)
-	# Round 2 
-	movl	8(%esi),%ecx
-	movl	8(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,8(%ebx)
-	# Round 3 
-	movl	12(%esi),%ecx
-	movl	12(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,12(%ebx)
-	# Round 4 
-	movl	16(%esi),%ecx
-	movl	16(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,16(%ebx)
-	# Round 5 
-	movl	20(%esi),%ecx
-	movl	20(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,20(%ebx)
-	# Round 6 
-	movl	24(%esi),%ecx
-	movl	24(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,24(%ebx)
-	# Round 7 
-	movl	28(%esi),%ecx
-	movl	28(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,28(%ebx)
-
-	addl	$32,%esi
-	addl	$32,%edi
-	addl	$32,%ebx
-	subl	$8,%ebp
-	jnz	L027aw_loop
-L026aw_finish:
-	movl	32(%esp),%ebp
-	andl	$7,%ebp
-	jz	L028aw_end
-	# Tail Round 0 
-	movl	(%esi),%ecx
-	movl	(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,(%ebx)
-	jz	L028aw_end
-	# Tail Round 1 
-	movl	4(%esi),%ecx
-	movl	4(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,4(%ebx)
-	jz	L028aw_end
-	# Tail Round 2 
-	movl	8(%esi),%ecx
-	movl	8(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,8(%ebx)
-	jz	L028aw_end
-	# Tail Round 3 
-	movl	12(%esi),%ecx
-	movl	12(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,12(%ebx)
-	jz	L028aw_end
-	# Tail Round 4 
-	movl	16(%esi),%ecx
-	movl	16(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,16(%ebx)
-	jz	L028aw_end
-	# Tail Round 5 
-	movl	20(%esi),%ecx
-	movl	20(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	decl	%ebp
-	movl	%ecx,20(%ebx)
-	jz	L028aw_end
-	# Tail Round 6 
-	movl	24(%esi),%ecx
-	movl	24(%edi),%edx
-	subl	%eax,%ecx
-	movl	$0,%eax
-	adcl	%eax,%eax
-	subl	%edx,%ecx
-	adcl	$0,%eax
-	movl	%ecx,24(%ebx)
-L028aw_end:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/co-586-apple.S b/apple-x86/crypto/fipsmodule/co-586-apple.S
deleted file mode 100644
index ab985ee..0000000
--- a/apple-x86/crypto/fipsmodule/co-586-apple.S
+++ /dev/null
@@ -1,1256 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_bn_mul_comba8
-.private_extern	_bn_mul_comba8
-.align	4
-_bn_mul_comba8:
-L_bn_mul_comba8_begin:
-	pushl	%esi
-	movl	12(%esp),%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	pushl	%ebp
-	pushl	%ebx
-	xorl	%ebx,%ebx
-	movl	(%esi),%eax
-	xorl	%ecx,%ecx
-	movl	(%edi),%edx
-	# ################## Calculate word 0 
-	xorl	%ebp,%ebp
-	# mul a[0]*b[0] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,(%eax)
-	movl	4(%esi),%eax
-	# saved r[0] 
-	# ################## Calculate word 1 
-	xorl	%ebx,%ebx
-	# mul a[1]*b[0] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	(%esi),%eax
-	adcl	%edx,%ebp
-	movl	4(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[0]*b[1] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,4(%eax)
-	movl	8(%esi),%eax
-	# saved r[1] 
-	# ################## Calculate word 2 
-	xorl	%ecx,%ecx
-	# mul a[2]*b[0] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	4(%esi),%eax
-	adcl	%edx,%ebx
-	movl	4(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[1]*b[1] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	(%esi),%eax
-	adcl	%edx,%ebx
-	movl	8(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[0]*b[2] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,8(%eax)
-	movl	12(%esi),%eax
-	# saved r[2] 
-	# ################## Calculate word 3 
-	xorl	%ebp,%ebp
-	# mul a[3]*b[0] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	8(%esi),%eax
-	adcl	%edx,%ecx
-	movl	4(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[2]*b[1] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	4(%esi),%eax
-	adcl	%edx,%ecx
-	movl	8(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[1]*b[2] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	(%esi),%eax
-	adcl	%edx,%ecx
-	movl	12(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[0]*b[3] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,12(%eax)
-	movl	16(%esi),%eax
-	# saved r[3] 
-	# ################## Calculate word 4 
-	xorl	%ebx,%ebx
-	# mul a[4]*b[0] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	12(%esi),%eax
-	adcl	%edx,%ebp
-	movl	4(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[3]*b[1] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	8(%esi),%eax
-	adcl	%edx,%ebp
-	movl	8(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[2]*b[2] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	4(%esi),%eax
-	adcl	%edx,%ebp
-	movl	12(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[1]*b[3] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	(%esi),%eax
-	adcl	%edx,%ebp
-	movl	16(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[0]*b[4] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,16(%eax)
-	movl	20(%esi),%eax
-	# saved r[4] 
-	# ################## Calculate word 5 
-	xorl	%ecx,%ecx
-	# mul a[5]*b[0] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	16(%esi),%eax
-	adcl	%edx,%ebx
-	movl	4(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[4]*b[1] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	12(%esi),%eax
-	adcl	%edx,%ebx
-	movl	8(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[3]*b[2] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	8(%esi),%eax
-	adcl	%edx,%ebx
-	movl	12(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[2]*b[3] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	4(%esi),%eax
-	adcl	%edx,%ebx
-	movl	16(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[1]*b[4] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	(%esi),%eax
-	adcl	%edx,%ebx
-	movl	20(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[0]*b[5] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,20(%eax)
-	movl	24(%esi),%eax
-	# saved r[5] 
-	# ################## Calculate word 6 
-	xorl	%ebp,%ebp
-	# mul a[6]*b[0] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esi),%eax
-	adcl	%edx,%ecx
-	movl	4(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[5]*b[1] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	16(%esi),%eax
-	adcl	%edx,%ecx
-	movl	8(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[4]*b[2] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	12(%esi),%eax
-	adcl	%edx,%ecx
-	movl	12(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[3]*b[3] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	8(%esi),%eax
-	adcl	%edx,%ecx
-	movl	16(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[2]*b[4] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	4(%esi),%eax
-	adcl	%edx,%ecx
-	movl	20(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[1]*b[5] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	(%esi),%eax
-	adcl	%edx,%ecx
-	movl	24(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[0]*b[6] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,24(%eax)
-	movl	28(%esi),%eax
-	# saved r[6] 
-	# ################## Calculate word 7 
-	xorl	%ebx,%ebx
-	# mul a[7]*b[0] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	24(%esi),%eax
-	adcl	%edx,%ebp
-	movl	4(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[6]*b[1] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esi),%eax
-	adcl	%edx,%ebp
-	movl	8(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[5]*b[2] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	16(%esi),%eax
-	adcl	%edx,%ebp
-	movl	12(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[4]*b[3] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	12(%esi),%eax
-	adcl	%edx,%ebp
-	movl	16(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[3]*b[4] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	8(%esi),%eax
-	adcl	%edx,%ebp
-	movl	20(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[2]*b[5] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	4(%esi),%eax
-	adcl	%edx,%ebp
-	movl	24(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[1]*b[6] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	(%esi),%eax
-	adcl	%edx,%ebp
-	movl	28(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[0]*b[7] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	4(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,28(%eax)
-	movl	28(%esi),%eax
-	# saved r[7] 
-	# ################## Calculate word 8 
-	xorl	%ecx,%ecx
-	# mul a[7]*b[1] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	24(%esi),%eax
-	adcl	%edx,%ebx
-	movl	8(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[6]*b[2] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esi),%eax
-	adcl	%edx,%ebx
-	movl	12(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[5]*b[3] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	16(%esi),%eax
-	adcl	%edx,%ebx
-	movl	16(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[4]*b[4] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	12(%esi),%eax
-	adcl	%edx,%ebx
-	movl	20(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[3]*b[5] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	8(%esi),%eax
-	adcl	%edx,%ebx
-	movl	24(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[2]*b[6] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	4(%esi),%eax
-	adcl	%edx,%ebx
-	movl	28(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[1]*b[7] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	8(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,32(%eax)
-	movl	28(%esi),%eax
-	# saved r[8] 
-	# ################## Calculate word 9 
-	xorl	%ebp,%ebp
-	# mul a[7]*b[2] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	24(%esi),%eax
-	adcl	%edx,%ecx
-	movl	12(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[6]*b[3] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esi),%eax
-	adcl	%edx,%ecx
-	movl	16(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[5]*b[4] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	16(%esi),%eax
-	adcl	%edx,%ecx
-	movl	20(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[4]*b[5] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	12(%esi),%eax
-	adcl	%edx,%ecx
-	movl	24(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[3]*b[6] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	8(%esi),%eax
-	adcl	%edx,%ecx
-	movl	28(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[2]*b[7] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	12(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,36(%eax)
-	movl	28(%esi),%eax
-	# saved r[9] 
-	# ################## Calculate word 10 
-	xorl	%ebx,%ebx
-	# mul a[7]*b[3] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	24(%esi),%eax
-	adcl	%edx,%ebp
-	movl	16(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[6]*b[4] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esi),%eax
-	adcl	%edx,%ebp
-	movl	20(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[5]*b[5] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	16(%esi),%eax
-	adcl	%edx,%ebp
-	movl	24(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[4]*b[6] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	12(%esi),%eax
-	adcl	%edx,%ebp
-	movl	28(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[3]*b[7] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	16(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,40(%eax)
-	movl	28(%esi),%eax
-	# saved r[10] 
-	# ################## Calculate word 11 
-	xorl	%ecx,%ecx
-	# mul a[7]*b[4] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	24(%esi),%eax
-	adcl	%edx,%ebx
-	movl	20(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[6]*b[5] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esi),%eax
-	adcl	%edx,%ebx
-	movl	24(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[5]*b[6] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	16(%esi),%eax
-	adcl	%edx,%ebx
-	movl	28(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[4]*b[7] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	20(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,44(%eax)
-	movl	28(%esi),%eax
-	# saved r[11] 
-	# ################## Calculate word 12 
-	xorl	%ebp,%ebp
-	# mul a[7]*b[5] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	24(%esi),%eax
-	adcl	%edx,%ecx
-	movl	24(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[6]*b[6] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esi),%eax
-	adcl	%edx,%ecx
-	movl	28(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[5]*b[7] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	24(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,48(%eax)
-	movl	28(%esi),%eax
-	# saved r[12] 
-	# ################## Calculate word 13 
-	xorl	%ebx,%ebx
-	# mul a[7]*b[6] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	24(%esi),%eax
-	adcl	%edx,%ebp
-	movl	28(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[6]*b[7] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	28(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,52(%eax)
-	movl	28(%esi),%eax
-	# saved r[13] 
-	# ################## Calculate word 14 
-	xorl	%ecx,%ecx
-	# mul a[7]*b[7] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	adcl	$0,%ecx
-	movl	%ebp,56(%eax)
-	# saved r[14] 
-	# save r[15] 
-	movl	%ebx,60(%eax)
-	popl	%ebx
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	ret
-.globl	_bn_mul_comba4
-.private_extern	_bn_mul_comba4
-.align	4
-_bn_mul_comba4:
-L_bn_mul_comba4_begin:
-	pushl	%esi
-	movl	12(%esp),%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	pushl	%ebp
-	pushl	%ebx
-	xorl	%ebx,%ebx
-	movl	(%esi),%eax
-	xorl	%ecx,%ecx
-	movl	(%edi),%edx
-	# ################## Calculate word 0 
-	xorl	%ebp,%ebp
-	# mul a[0]*b[0] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,(%eax)
-	movl	4(%esi),%eax
-	# saved r[0] 
-	# ################## Calculate word 1 
-	xorl	%ebx,%ebx
-	# mul a[1]*b[0] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	(%esi),%eax
-	adcl	%edx,%ebp
-	movl	4(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[0]*b[1] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,4(%eax)
-	movl	8(%esi),%eax
-	# saved r[1] 
-	# ################## Calculate word 2 
-	xorl	%ecx,%ecx
-	# mul a[2]*b[0] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	4(%esi),%eax
-	adcl	%edx,%ebx
-	movl	4(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[1]*b[1] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	(%esi),%eax
-	adcl	%edx,%ebx
-	movl	8(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[0]*b[2] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,8(%eax)
-	movl	12(%esi),%eax
-	# saved r[2] 
-	# ################## Calculate word 3 
-	xorl	%ebp,%ebp
-	# mul a[3]*b[0] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	8(%esi),%eax
-	adcl	%edx,%ecx
-	movl	4(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[2]*b[1] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	4(%esi),%eax
-	adcl	%edx,%ecx
-	movl	8(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[1]*b[2] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	(%esi),%eax
-	adcl	%edx,%ecx
-	movl	12(%edi),%edx
-	adcl	$0,%ebp
-	# mul a[0]*b[3] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	movl	4(%edi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,12(%eax)
-	movl	12(%esi),%eax
-	# saved r[3] 
-	# ################## Calculate word 4 
-	xorl	%ebx,%ebx
-	# mul a[3]*b[1] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	8(%esi),%eax
-	adcl	%edx,%ebp
-	movl	8(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[2]*b[2] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	4(%esi),%eax
-	adcl	%edx,%ebp
-	movl	12(%edi),%edx
-	adcl	$0,%ebx
-	# mul a[1]*b[3] 
-	mull	%edx
-	addl	%eax,%ecx
-	movl	20(%esp),%eax
-	adcl	%edx,%ebp
-	movl	8(%edi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,16(%eax)
-	movl	12(%esi),%eax
-	# saved r[4] 
-	# ################## Calculate word 5 
-	xorl	%ecx,%ecx
-	# mul a[3]*b[2] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	8(%esi),%eax
-	adcl	%edx,%ebx
-	movl	12(%edi),%edx
-	adcl	$0,%ecx
-	# mul a[2]*b[3] 
-	mull	%edx
-	addl	%eax,%ebp
-	movl	20(%esp),%eax
-	adcl	%edx,%ebx
-	movl	12(%edi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,20(%eax)
-	movl	12(%esi),%eax
-	# saved r[5] 
-	# ################## Calculate word 6 
-	xorl	%ebp,%ebp
-	# mul a[3]*b[3] 
-	mull	%edx
-	addl	%eax,%ebx
-	movl	20(%esp),%eax
-	adcl	%edx,%ecx
-	adcl	$0,%ebp
-	movl	%ebx,24(%eax)
-	# saved r[6] 
-	# save r[7] 
-	movl	%ecx,28(%eax)
-	popl	%ebx
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	ret
-.globl	_bn_sqr_comba8
-.private_extern	_bn_sqr_comba8
-.align	4
-_bn_sqr_comba8:
-L_bn_sqr_comba8_begin:
-	pushl	%esi
-	pushl	%edi
-	pushl	%ebp
-	pushl	%ebx
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	xorl	%ebx,%ebx
-	xorl	%ecx,%ecx
-	movl	(%esi),%eax
-	# ############### Calculate word 0 
-	xorl	%ebp,%ebp
-	# sqr a[0]*a[0] 
-	mull	%eax
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	(%esi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,(%edi)
-	movl	4(%esi),%eax
-	# saved r[0] 
-	# ############### Calculate word 1 
-	xorl	%ebx,%ebx
-	# sqr a[1]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	8(%esi),%eax
-	adcl	$0,%ebx
-	movl	%ecx,4(%edi)
-	movl	(%esi),%edx
-	# saved r[1] 
-	# ############### Calculate word 2 
-	xorl	%ecx,%ecx
-	# sqr a[2]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	4(%esi),%eax
-	adcl	$0,%ecx
-	# sqr a[1]*a[1] 
-	mull	%eax
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	(%esi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,8(%edi)
-	movl	12(%esi),%eax
-	# saved r[2] 
-	# ############### Calculate word 3 
-	xorl	%ebp,%ebp
-	# sqr a[3]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	8(%esi),%eax
-	adcl	$0,%ebp
-	movl	4(%esi),%edx
-	# sqr a[2]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	16(%esi),%eax
-	adcl	$0,%ebp
-	movl	%ebx,12(%edi)
-	movl	(%esi),%edx
-	# saved r[3] 
-	# ############### Calculate word 4 
-	xorl	%ebx,%ebx
-	# sqr a[4]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	12(%esi),%eax
-	adcl	$0,%ebx
-	movl	4(%esi),%edx
-	# sqr a[3]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	8(%esi),%eax
-	adcl	$0,%ebx
-	# sqr a[2]*a[2] 
-	mull	%eax
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	(%esi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,16(%edi)
-	movl	20(%esi),%eax
-	# saved r[4] 
-	# ############### Calculate word 5 
-	xorl	%ecx,%ecx
-	# sqr a[5]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	16(%esi),%eax
-	adcl	$0,%ecx
-	movl	4(%esi),%edx
-	# sqr a[4]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	12(%esi),%eax
-	adcl	$0,%ecx
-	movl	8(%esi),%edx
-	# sqr a[3]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	24(%esi),%eax
-	adcl	$0,%ecx
-	movl	%ebp,20(%edi)
-	movl	(%esi),%edx
-	# saved r[5] 
-	# ############### Calculate word 6 
-	xorl	%ebp,%ebp
-	# sqr a[6]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	20(%esi),%eax
-	adcl	$0,%ebp
-	movl	4(%esi),%edx
-	# sqr a[5]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	16(%esi),%eax
-	adcl	$0,%ebp
-	movl	8(%esi),%edx
-	# sqr a[4]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	12(%esi),%eax
-	adcl	$0,%ebp
-	# sqr a[3]*a[3] 
-	mull	%eax
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	(%esi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,24(%edi)
-	movl	28(%esi),%eax
-	# saved r[6] 
-	# ############### Calculate word 7 
-	xorl	%ebx,%ebx
-	# sqr a[7]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	24(%esi),%eax
-	adcl	$0,%ebx
-	movl	4(%esi),%edx
-	# sqr a[6]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	20(%esi),%eax
-	adcl	$0,%ebx
-	movl	8(%esi),%edx
-	# sqr a[5]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	16(%esi),%eax
-	adcl	$0,%ebx
-	movl	12(%esi),%edx
-	# sqr a[4]*a[3] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	28(%esi),%eax
-	adcl	$0,%ebx
-	movl	%ecx,28(%edi)
-	movl	4(%esi),%edx
-	# saved r[7] 
-	# ############### Calculate word 8 
-	xorl	%ecx,%ecx
-	# sqr a[7]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	24(%esi),%eax
-	adcl	$0,%ecx
-	movl	8(%esi),%edx
-	# sqr a[6]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	20(%esi),%eax
-	adcl	$0,%ecx
-	movl	12(%esi),%edx
-	# sqr a[5]*a[3] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	16(%esi),%eax
-	adcl	$0,%ecx
-	# sqr a[4]*a[4] 
-	mull	%eax
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	8(%esi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,32(%edi)
-	movl	28(%esi),%eax
-	# saved r[8] 
-	# ############### Calculate word 9 
-	xorl	%ebp,%ebp
-	# sqr a[7]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	24(%esi),%eax
-	adcl	$0,%ebp
-	movl	12(%esi),%edx
-	# sqr a[6]*a[3] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	20(%esi),%eax
-	adcl	$0,%ebp
-	movl	16(%esi),%edx
-	# sqr a[5]*a[4] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	28(%esi),%eax
-	adcl	$0,%ebp
-	movl	%ebx,36(%edi)
-	movl	12(%esi),%edx
-	# saved r[9] 
-	# ############### Calculate word 10 
-	xorl	%ebx,%ebx
-	# sqr a[7]*a[3] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	24(%esi),%eax
-	adcl	$0,%ebx
-	movl	16(%esi),%edx
-	# sqr a[6]*a[4] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	20(%esi),%eax
-	adcl	$0,%ebx
-	# sqr a[5]*a[5] 
-	mull	%eax
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	16(%esi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,40(%edi)
-	movl	28(%esi),%eax
-	# saved r[10] 
-	# ############### Calculate word 11 
-	xorl	%ecx,%ecx
-	# sqr a[7]*a[4] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	24(%esi),%eax
-	adcl	$0,%ecx
-	movl	20(%esi),%edx
-	# sqr a[6]*a[5] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	28(%esi),%eax
-	adcl	$0,%ecx
-	movl	%ebp,44(%edi)
-	movl	20(%esi),%edx
-	# saved r[11] 
-	# ############### Calculate word 12 
-	xorl	%ebp,%ebp
-	# sqr a[7]*a[5] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	24(%esi),%eax
-	adcl	$0,%ebp
-	# sqr a[6]*a[6] 
-	mull	%eax
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	24(%esi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,48(%edi)
-	movl	28(%esi),%eax
-	# saved r[12] 
-	# ############### Calculate word 13 
-	xorl	%ebx,%ebx
-	# sqr a[7]*a[6] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	28(%esi),%eax
-	adcl	$0,%ebx
-	movl	%ecx,52(%edi)
-	# saved r[13] 
-	# ############### Calculate word 14 
-	xorl	%ecx,%ecx
-	# sqr a[7]*a[7] 
-	mull	%eax
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	adcl	$0,%ecx
-	movl	%ebp,56(%edi)
-	# saved r[14] 
-	movl	%ebx,60(%edi)
-	popl	%ebx
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	ret
-.globl	_bn_sqr_comba4
-.private_extern	_bn_sqr_comba4
-.align	4
-_bn_sqr_comba4:
-L_bn_sqr_comba4_begin:
-	pushl	%esi
-	pushl	%edi
-	pushl	%ebp
-	pushl	%ebx
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	xorl	%ebx,%ebx
-	xorl	%ecx,%ecx
-	movl	(%esi),%eax
-	# ############### Calculate word 0 
-	xorl	%ebp,%ebp
-	# sqr a[0]*a[0] 
-	mull	%eax
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	(%esi),%edx
-	adcl	$0,%ebp
-	movl	%ebx,(%edi)
-	movl	4(%esi),%eax
-	# saved r[0] 
-	# ############### Calculate word 1 
-	xorl	%ebx,%ebx
-	# sqr a[1]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	8(%esi),%eax
-	adcl	$0,%ebx
-	movl	%ecx,4(%edi)
-	movl	(%esi),%edx
-	# saved r[1] 
-	# ############### Calculate word 2 
-	xorl	%ecx,%ecx
-	# sqr a[2]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	4(%esi),%eax
-	adcl	$0,%ecx
-	# sqr a[1]*a[1] 
-	mull	%eax
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	(%esi),%edx
-	adcl	$0,%ecx
-	movl	%ebp,8(%edi)
-	movl	12(%esi),%eax
-	# saved r[2] 
-	# ############### Calculate word 3 
-	xorl	%ebp,%ebp
-	# sqr a[3]*a[0] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	8(%esi),%eax
-	adcl	$0,%ebp
-	movl	4(%esi),%edx
-	# sqr a[2]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebp
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	movl	12(%esi),%eax
-	adcl	$0,%ebp
-	movl	%ebx,12(%edi)
-	movl	4(%esi),%edx
-	# saved r[3] 
-	# ############### Calculate word 4 
-	xorl	%ebx,%ebx
-	# sqr a[3]*a[1] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ebx
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	8(%esi),%eax
-	adcl	$0,%ebx
-	# sqr a[2]*a[2] 
-	mull	%eax
-	addl	%eax,%ecx
-	adcl	%edx,%ebp
-	movl	8(%esi),%edx
-	adcl	$0,%ebx
-	movl	%ecx,16(%edi)
-	movl	12(%esi),%eax
-	# saved r[4] 
-	# ############### Calculate word 5 
-	xorl	%ecx,%ecx
-	# sqr a[3]*a[2] 
-	mull	%edx
-	addl	%eax,%eax
-	adcl	%edx,%edx
-	adcl	$0,%ecx
-	addl	%eax,%ebp
-	adcl	%edx,%ebx
-	movl	12(%esi),%eax
-	adcl	$0,%ecx
-	movl	%ebp,20(%edi)
-	# saved r[5] 
-	# ############### Calculate word 6 
-	xorl	%ebp,%ebp
-	# sqr a[3]*a[3] 
-	mull	%eax
-	addl	%eax,%ebx
-	adcl	%edx,%ecx
-	adcl	$0,%ebp
-	movl	%ebx,24(%edi)
-	# saved r[6] 
-	movl	%ecx,28(%edi)
-	popl	%ebx
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	ret
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S b/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S
deleted file mode 100644
index 24b1f2f..0000000
--- a/apple-x86/crypto/fipsmodule/ghash-ssse3-x86-apple.S
+++ /dev/null
@@ -1,288 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_gcm_gmult_ssse3
-.private_extern	_gcm_gmult_ssse3
-.align	4
-_gcm_gmult_ssse3:
-L_gcm_gmult_ssse3_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	movdqu	(%edi),%xmm0
-	call	L000pic_point
-L000pic_point:
-	popl	%eax
-	movdqa	Lreverse_bytes-L000pic_point(%eax),%xmm7
-	movdqa	Llow4_mask-L000pic_point(%eax),%xmm2
-.byte	102,15,56,0,199
-	movdqa	%xmm2,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm2,%xmm0
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	movl	$5,%eax
-L001loop_row_1:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L001loop_row_1
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movl	$5,%eax
-L002loop_row_2:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L002loop_row_2
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movl	$6,%eax
-L003loop_row_3:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L003loop_row_3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-.byte	102,15,56,0,215
-	movdqu	%xmm2,(%edi)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_gcm_ghash_ssse3
-.private_extern	_gcm_ghash_ssse3
-.align	4
-_gcm_ghash_ssse3:
-L_gcm_ghash_ssse3_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%edi
-	movl	24(%esp),%esi
-	movl	28(%esp),%edx
-	movl	32(%esp),%ecx
-	movdqu	(%edi),%xmm0
-	call	L004pic_point
-L004pic_point:
-	popl	%ebx
-	movdqa	Lreverse_bytes-L004pic_point(%ebx),%xmm7
-	andl	$-16,%ecx
-.byte	102,15,56,0,199
-	pxor	%xmm3,%xmm3
-L005loop_ghash:
-	movdqa	Llow4_mask-L004pic_point(%ebx),%xmm2
-	movdqu	(%edx),%xmm1
-.byte	102,15,56,0,207
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm2,%xmm0
-	pxor	%xmm2,%xmm2
-	movl	$5,%eax
-L006loop_row_4:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L006loop_row_4
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movl	$5,%eax
-L007loop_row_5:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L007loop_row_5
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movl	$6,%eax
-L008loop_row_6:
-	movdqa	(%esi),%xmm4
-	leal	16(%esi),%esi
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-	subl	$1,%eax
-	jnz	L008loop_row_6
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movdqa	%xmm2,%xmm0
-	leal	-256(%esi),%esi
-	leal	16(%edx),%edx
-	subl	$16,%ecx
-	jnz	L005loop_ghash
-.byte	102,15,56,0,199
-	movdqu	%xmm0,(%edi)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	4,0x90
-Lreverse_bytes:
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.align	4,0x90
-Llow4_mask:
-.long	252645135,252645135,252645135,252645135
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/ghash-x86-apple.S b/apple-x86/crypto/fipsmodule/ghash-x86-apple.S
deleted file mode 100644
index a178b74..0000000
--- a/apple-x86/crypto/fipsmodule/ghash-x86-apple.S
+++ /dev/null
@@ -1,322 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_gcm_init_clmul
-.private_extern	_gcm_init_clmul
-.align	4
-_gcm_init_clmul:
-L_gcm_init_clmul_begin:
-	movl	4(%esp),%edx
-	movl	8(%esp),%eax
-	call	L000pic
-L000pic:
-	popl	%ecx
-	leal	Lbswap-L000pic(%ecx),%ecx
-	movdqu	(%eax),%xmm2
-	pshufd	$78,%xmm2,%xmm2
-	pshufd	$255,%xmm2,%xmm4
-	movdqa	%xmm2,%xmm3
-	psllq	$1,%xmm2
-	pxor	%xmm5,%xmm5
-	psrlq	$63,%xmm3
-	pcmpgtd	%xmm4,%xmm5
-	pslldq	$8,%xmm3
-	por	%xmm3,%xmm2
-	pand	16(%ecx),%xmm5
-	pxor	%xmm5,%xmm2
-	movdqa	%xmm2,%xmm0
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	pshufd	$78,%xmm2,%xmm3
-	pshufd	$78,%xmm0,%xmm4
-	pxor	%xmm2,%xmm3
-	movdqu	%xmm2,(%edx)
-	pxor	%xmm0,%xmm4
-	movdqu	%xmm0,16(%edx)
-.byte	102,15,58,15,227,8
-	movdqu	%xmm4,32(%edx)
-	ret
-.globl	_gcm_gmult_clmul
-.private_extern	_gcm_gmult_clmul
-.align	4
-_gcm_gmult_clmul:
-L_gcm_gmult_clmul_begin:
-	movl	4(%esp),%eax
-	movl	8(%esp),%edx
-	call	L001pic
-L001pic:
-	popl	%ecx
-	leal	Lbswap-L001pic(%ecx),%ecx
-	movdqu	(%eax),%xmm0
-	movdqa	(%ecx),%xmm5
-	movups	(%edx),%xmm2
-.byte	102,15,56,0,197
-	movups	32(%edx),%xmm4
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
-	movdqu	%xmm0,(%eax)
-	ret
-.globl	_gcm_ghash_clmul
-.private_extern	_gcm_ghash_clmul
-.align	4
-_gcm_ghash_clmul:
-L_gcm_ghash_clmul_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%eax
-	movl	24(%esp),%edx
-	movl	28(%esp),%esi
-	movl	32(%esp),%ebx
-	call	L002pic
-L002pic:
-	popl	%ecx
-	leal	Lbswap-L002pic(%ecx),%ecx
-	movdqu	(%eax),%xmm0
-	movdqa	(%ecx),%xmm5
-	movdqu	(%edx),%xmm2
-.byte	102,15,56,0,197
-	subl	$16,%ebx
-	jz	L003odd_tail
-	movdqu	(%esi),%xmm3
-	movdqu	16(%esi),%xmm6
-.byte	102,15,56,0,221
-.byte	102,15,56,0,245
-	movdqu	32(%edx),%xmm5
-	pxor	%xmm3,%xmm0
-	pshufd	$78,%xmm6,%xmm3
-	movdqa	%xmm6,%xmm7
-	pxor	%xmm6,%xmm3
-	leal	32(%esi),%esi
-.byte	102,15,58,68,242,0
-.byte	102,15,58,68,250,17
-.byte	102,15,58,68,221,0
-	movups	16(%edx),%xmm2
-	nop
-	subl	$32,%ebx
-	jbe	L004even_tail
-	jmp	L005mod_loop
-.align	5,0x90
-L005mod_loop:
-	pshufd	$78,%xmm0,%xmm4
-	movdqa	%xmm0,%xmm1
-	pxor	%xmm0,%xmm4
-	nop
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
-	movups	(%edx),%xmm2
-	xorps	%xmm6,%xmm0
-	movdqa	(%ecx),%xmm5
-	xorps	%xmm7,%xmm1
-	movdqu	(%esi),%xmm7
-	pxor	%xmm0,%xmm3
-	movdqu	16(%esi),%xmm6
-	pxor	%xmm1,%xmm3
-.byte	102,15,56,0,253
-	pxor	%xmm3,%xmm4
-	movdqa	%xmm4,%xmm3
-	psrldq	$8,%xmm4
-	pslldq	$8,%xmm3
-	pxor	%xmm4,%xmm1
-	pxor	%xmm3,%xmm0
-.byte	102,15,56,0,245
-	pxor	%xmm7,%xmm1
-	movdqa	%xmm6,%xmm7
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-.byte	102,15,58,68,242,0
-	movups	32(%edx),%xmm5
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-	pshufd	$78,%xmm7,%xmm3
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm7,%xmm3
-	pxor	%xmm4,%xmm1
-.byte	102,15,58,68,250,17
-	movups	16(%edx),%xmm2
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-.byte	102,15,58,68,221,0
-	leal	32(%esi),%esi
-	subl	$32,%ebx
-	ja	L005mod_loop
-L004even_tail:
-	pshufd	$78,%xmm0,%xmm4
-	movdqa	%xmm0,%xmm1
-	pxor	%xmm0,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,229,16
-	movdqa	(%ecx),%xmm5
-	xorps	%xmm6,%xmm0
-	xorps	%xmm7,%xmm1
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-	pxor	%xmm3,%xmm4
-	movdqa	%xmm4,%xmm3
-	psrldq	$8,%xmm4
-	pslldq	$8,%xmm3
-	pxor	%xmm4,%xmm1
-	pxor	%xmm3,%xmm0
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	testl	%ebx,%ebx
-	jnz	L006done
-	movups	(%edx),%xmm2
-L003odd_tail:
-	movdqu	(%esi),%xmm3
-.byte	102,15,56,0,221
-	pxor	%xmm3,%xmm0
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pshufd	$78,%xmm2,%xmm4
-	pxor	%xmm0,%xmm3
-	pxor	%xmm2,%xmm4
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	xorps	%xmm0,%xmm3
-	xorps	%xmm1,%xmm3
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-L006done:
-.byte	102,15,56,0,197
-	movdqu	%xmm0,(%eax)
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	6,0x90
-Lbswap:
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
-.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
-.byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
-.byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
-.byte	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/md5-586-apple.S b/apple-x86/crypto/fipsmodule/md5-586-apple.S
deleted file mode 100644
index 986d590..0000000
--- a/apple-x86/crypto/fipsmodule/md5-586-apple.S
+++ /dev/null
@@ -1,684 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_md5_block_asm_data_order
-.private_extern	_md5_block_asm_data_order
-.align	4
-_md5_block_asm_data_order:
-L_md5_block_asm_data_order_begin:
-	pushl	%esi
-	pushl	%edi
-	movl	12(%esp),%edi
-	movl	16(%esp),%esi
-	movl	20(%esp),%ecx
-	pushl	%ebp
-	shll	$6,%ecx
-	pushl	%ebx
-	addl	%esi,%ecx
-	subl	$64,%ecx
-	movl	(%edi),%eax
-	pushl	%ecx
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-L000start:
-
-	# R0 section 
-	movl	%ecx,%edi
-	movl	(%esi),%ebp
-	# R0 0 
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	leal	3614090360(%eax,%ebp,1),%eax
-	xorl	%edx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$7,%eax
-	movl	4(%esi),%ebp
-	addl	%ebx,%eax
-	# R0 1 
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	leal	3905402710(%edx,%ebp,1),%edx
-	xorl	%ecx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$12,%edx
-	movl	8(%esi),%ebp
-	addl	%eax,%edx
-	# R0 2 
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	leal	606105819(%ecx,%ebp,1),%ecx
-	xorl	%ebx,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$17,%ecx
-	movl	12(%esi),%ebp
-	addl	%edx,%ecx
-	# R0 3 
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	leal	3250441966(%ebx,%ebp,1),%ebx
-	xorl	%eax,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$22,%ebx
-	movl	16(%esi),%ebp
-	addl	%ecx,%ebx
-	# R0 4 
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	leal	4118548399(%eax,%ebp,1),%eax
-	xorl	%edx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$7,%eax
-	movl	20(%esi),%ebp
-	addl	%ebx,%eax
-	# R0 5 
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	leal	1200080426(%edx,%ebp,1),%edx
-	xorl	%ecx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$12,%edx
-	movl	24(%esi),%ebp
-	addl	%eax,%edx
-	# R0 6 
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	leal	2821735955(%ecx,%ebp,1),%ecx
-	xorl	%ebx,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$17,%ecx
-	movl	28(%esi),%ebp
-	addl	%edx,%ecx
-	# R0 7 
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	leal	4249261313(%ebx,%ebp,1),%ebx
-	xorl	%eax,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$22,%ebx
-	movl	32(%esi),%ebp
-	addl	%ecx,%ebx
-	# R0 8 
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	leal	1770035416(%eax,%ebp,1),%eax
-	xorl	%edx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$7,%eax
-	movl	36(%esi),%ebp
-	addl	%ebx,%eax
-	# R0 9 
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	leal	2336552879(%edx,%ebp,1),%edx
-	xorl	%ecx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$12,%edx
-	movl	40(%esi),%ebp
-	addl	%eax,%edx
-	# R0 10 
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	leal	4294925233(%ecx,%ebp,1),%ecx
-	xorl	%ebx,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$17,%ecx
-	movl	44(%esi),%ebp
-	addl	%edx,%ecx
-	# R0 11 
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	leal	2304563134(%ebx,%ebp,1),%ebx
-	xorl	%eax,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$22,%ebx
-	movl	48(%esi),%ebp
-	addl	%ecx,%ebx
-	# R0 12 
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	leal	1804603682(%eax,%ebp,1),%eax
-	xorl	%edx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$7,%eax
-	movl	52(%esi),%ebp
-	addl	%ebx,%eax
-	# R0 13 
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	leal	4254626195(%edx,%ebp,1),%edx
-	xorl	%ecx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$12,%edx
-	movl	56(%esi),%ebp
-	addl	%eax,%edx
-	# R0 14 
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	leal	2792965006(%ecx,%ebp,1),%ecx
-	xorl	%ebx,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$17,%ecx
-	movl	60(%esi),%ebp
-	addl	%edx,%ecx
-	# R0 15 
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	leal	1236535329(%ebx,%ebp,1),%ebx
-	xorl	%eax,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$22,%ebx
-	movl	4(%esi),%ebp
-	addl	%ecx,%ebx
-
-	# R1 section 
-	# R1 16 
-	leal	4129170786(%eax,%ebp,1),%eax
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	movl	24(%esi),%ebp
-	xorl	%ecx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$5,%eax
-	addl	%ebx,%eax
-	# R1 17 
-	leal	3225465664(%edx,%ebp,1),%edx
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	movl	44(%esi),%ebp
-	xorl	%ebx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$9,%edx
-	addl	%eax,%edx
-	# R1 18 
-	leal	643717713(%ecx,%ebp,1),%ecx
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	movl	(%esi),%ebp
-	xorl	%eax,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	# R1 19 
-	leal	3921069994(%ebx,%ebp,1),%ebx
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	movl	20(%esi),%ebp
-	xorl	%edx,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	# R1 20 
-	leal	3593408605(%eax,%ebp,1),%eax
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	movl	40(%esi),%ebp
-	xorl	%ecx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$5,%eax
-	addl	%ebx,%eax
-	# R1 21 
-	leal	38016083(%edx,%ebp,1),%edx
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	movl	60(%esi),%ebp
-	xorl	%ebx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$9,%edx
-	addl	%eax,%edx
-	# R1 22 
-	leal	3634488961(%ecx,%ebp,1),%ecx
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	movl	16(%esi),%ebp
-	xorl	%eax,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	# R1 23 
-	leal	3889429448(%ebx,%ebp,1),%ebx
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	movl	36(%esi),%ebp
-	xorl	%edx,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	# R1 24 
-	leal	568446438(%eax,%ebp,1),%eax
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	movl	56(%esi),%ebp
-	xorl	%ecx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$5,%eax
-	addl	%ebx,%eax
-	# R1 25 
-	leal	3275163606(%edx,%ebp,1),%edx
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	movl	12(%esi),%ebp
-	xorl	%ebx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$9,%edx
-	addl	%eax,%edx
-	# R1 26 
-	leal	4107603335(%ecx,%ebp,1),%ecx
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	movl	32(%esi),%ebp
-	xorl	%eax,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	# R1 27 
-	leal	1163531501(%ebx,%ebp,1),%ebx
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	movl	52(%esi),%ebp
-	xorl	%edx,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	# R1 28 
-	leal	2850285829(%eax,%ebp,1),%eax
-	xorl	%ebx,%edi
-	andl	%edx,%edi
-	movl	8(%esi),%ebp
-	xorl	%ecx,%edi
-	addl	%edi,%eax
-	movl	%ebx,%edi
-	roll	$5,%eax
-	addl	%ebx,%eax
-	# R1 29 
-	leal	4243563512(%edx,%ebp,1),%edx
-	xorl	%eax,%edi
-	andl	%ecx,%edi
-	movl	28(%esi),%ebp
-	xorl	%ebx,%edi
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$9,%edx
-	addl	%eax,%edx
-	# R1 30 
-	leal	1735328473(%ecx,%ebp,1),%ecx
-	xorl	%edx,%edi
-	andl	%ebx,%edi
-	movl	48(%esi),%ebp
-	xorl	%eax,%edi
-	addl	%edi,%ecx
-	movl	%edx,%edi
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	# R1 31 
-	leal	2368359562(%ebx,%ebp,1),%ebx
-	xorl	%ecx,%edi
-	andl	%eax,%edi
-	movl	20(%esi),%ebp
-	xorl	%edx,%edi
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-
-	# R2 section 
-	# R2 32 
-	xorl	%edx,%edi
-	xorl	%ebx,%edi
-	leal	4294588738(%eax,%ebp,1),%eax
-	addl	%edi,%eax
-	roll	$4,%eax
-	movl	32(%esi),%ebp
-	movl	%ebx,%edi
-	# R2 33 
-	leal	2272392833(%edx,%ebp,1),%edx
-	addl	%ebx,%eax
-	xorl	%ecx,%edi
-	xorl	%eax,%edi
-	movl	44(%esi),%ebp
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$11,%edx
-	addl	%eax,%edx
-	# R2 34 
-	xorl	%ebx,%edi
-	xorl	%edx,%edi
-	leal	1839030562(%ecx,%ebp,1),%ecx
-	addl	%edi,%ecx
-	roll	$16,%ecx
-	movl	56(%esi),%ebp
-	movl	%edx,%edi
-	# R2 35 
-	leal	4259657740(%ebx,%ebp,1),%ebx
-	addl	%edx,%ecx
-	xorl	%eax,%edi
-	xorl	%ecx,%edi
-	movl	4(%esi),%ebp
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$23,%ebx
-	addl	%ecx,%ebx
-	# R2 36 
-	xorl	%edx,%edi
-	xorl	%ebx,%edi
-	leal	2763975236(%eax,%ebp,1),%eax
-	addl	%edi,%eax
-	roll	$4,%eax
-	movl	16(%esi),%ebp
-	movl	%ebx,%edi
-	# R2 37 
-	leal	1272893353(%edx,%ebp,1),%edx
-	addl	%ebx,%eax
-	xorl	%ecx,%edi
-	xorl	%eax,%edi
-	movl	28(%esi),%ebp
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$11,%edx
-	addl	%eax,%edx
-	# R2 38 
-	xorl	%ebx,%edi
-	xorl	%edx,%edi
-	leal	4139469664(%ecx,%ebp,1),%ecx
-	addl	%edi,%ecx
-	roll	$16,%ecx
-	movl	40(%esi),%ebp
-	movl	%edx,%edi
-	# R2 39 
-	leal	3200236656(%ebx,%ebp,1),%ebx
-	addl	%edx,%ecx
-	xorl	%eax,%edi
-	xorl	%ecx,%edi
-	movl	52(%esi),%ebp
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$23,%ebx
-	addl	%ecx,%ebx
-	# R2 40 
-	xorl	%edx,%edi
-	xorl	%ebx,%edi
-	leal	681279174(%eax,%ebp,1),%eax
-	addl	%edi,%eax
-	roll	$4,%eax
-	movl	(%esi),%ebp
-	movl	%ebx,%edi
-	# R2 41 
-	leal	3936430074(%edx,%ebp,1),%edx
-	addl	%ebx,%eax
-	xorl	%ecx,%edi
-	xorl	%eax,%edi
-	movl	12(%esi),%ebp
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$11,%edx
-	addl	%eax,%edx
-	# R2 42 
-	xorl	%ebx,%edi
-	xorl	%edx,%edi
-	leal	3572445317(%ecx,%ebp,1),%ecx
-	addl	%edi,%ecx
-	roll	$16,%ecx
-	movl	24(%esi),%ebp
-	movl	%edx,%edi
-	# R2 43 
-	leal	76029189(%ebx,%ebp,1),%ebx
-	addl	%edx,%ecx
-	xorl	%eax,%edi
-	xorl	%ecx,%edi
-	movl	36(%esi),%ebp
-	addl	%edi,%ebx
-	movl	%ecx,%edi
-	roll	$23,%ebx
-	addl	%ecx,%ebx
-	# R2 44 
-	xorl	%edx,%edi
-	xorl	%ebx,%edi
-	leal	3654602809(%eax,%ebp,1),%eax
-	addl	%edi,%eax
-	roll	$4,%eax
-	movl	48(%esi),%ebp
-	movl	%ebx,%edi
-	# R2 45 
-	leal	3873151461(%edx,%ebp,1),%edx
-	addl	%ebx,%eax
-	xorl	%ecx,%edi
-	xorl	%eax,%edi
-	movl	60(%esi),%ebp
-	addl	%edi,%edx
-	movl	%eax,%edi
-	roll	$11,%edx
-	addl	%eax,%edx
-	# R2 46 
-	xorl	%ebx,%edi
-	xorl	%edx,%edi
-	leal	530742520(%ecx,%ebp,1),%ecx
-	addl	%edi,%ecx
-	roll	$16,%ecx
-	movl	8(%esi),%ebp
-	movl	%edx,%edi
-	# R2 47 
-	leal	3299628645(%ebx,%ebp,1),%ebx
-	addl	%edx,%ecx
-	xorl	%eax,%edi
-	xorl	%ecx,%edi
-	movl	(%esi),%ebp
-	addl	%edi,%ebx
-	movl	$-1,%edi
-	roll	$23,%ebx
-	addl	%ecx,%ebx
-
-	# R3 section 
-	# R3 48 
-	xorl	%edx,%edi
-	orl	%ebx,%edi
-	leal	4096336452(%eax,%ebp,1),%eax
-	xorl	%ecx,%edi
-	movl	28(%esi),%ebp
-	addl	%edi,%eax
-	movl	$-1,%edi
-	roll	$6,%eax
-	xorl	%ecx,%edi
-	addl	%ebx,%eax
-	# R3 49 
-	orl	%eax,%edi
-	leal	1126891415(%edx,%ebp,1),%edx
-	xorl	%ebx,%edi
-	movl	56(%esi),%ebp
-	addl	%edi,%edx
-	movl	$-1,%edi
-	roll	$10,%edx
-	xorl	%ebx,%edi
-	addl	%eax,%edx
-	# R3 50 
-	orl	%edx,%edi
-	leal	2878612391(%ecx,%ebp,1),%ecx
-	xorl	%eax,%edi
-	movl	20(%esi),%ebp
-	addl	%edi,%ecx
-	movl	$-1,%edi
-	roll	$15,%ecx
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	# R3 51 
-	orl	%ecx,%edi
-	leal	4237533241(%ebx,%ebp,1),%ebx
-	xorl	%edx,%edi
-	movl	48(%esi),%ebp
-	addl	%edi,%ebx
-	movl	$-1,%edi
-	roll	$21,%ebx
-	xorl	%edx,%edi
-	addl	%ecx,%ebx
-	# R3 52 
-	orl	%ebx,%edi
-	leal	1700485571(%eax,%ebp,1),%eax
-	xorl	%ecx,%edi
-	movl	12(%esi),%ebp
-	addl	%edi,%eax
-	movl	$-1,%edi
-	roll	$6,%eax
-	xorl	%ecx,%edi
-	addl	%ebx,%eax
-	# R3 53 
-	orl	%eax,%edi
-	leal	2399980690(%edx,%ebp,1),%edx
-	xorl	%ebx,%edi
-	movl	40(%esi),%ebp
-	addl	%edi,%edx
-	movl	$-1,%edi
-	roll	$10,%edx
-	xorl	%ebx,%edi
-	addl	%eax,%edx
-	# R3 54 
-	orl	%edx,%edi
-	leal	4293915773(%ecx,%ebp,1),%ecx
-	xorl	%eax,%edi
-	movl	4(%esi),%ebp
-	addl	%edi,%ecx
-	movl	$-1,%edi
-	roll	$15,%ecx
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	# R3 55 
-	orl	%ecx,%edi
-	leal	2240044497(%ebx,%ebp,1),%ebx
-	xorl	%edx,%edi
-	movl	32(%esi),%ebp
-	addl	%edi,%ebx
-	movl	$-1,%edi
-	roll	$21,%ebx
-	xorl	%edx,%edi
-	addl	%ecx,%ebx
-	# R3 56 
-	orl	%ebx,%edi
-	leal	1873313359(%eax,%ebp,1),%eax
-	xorl	%ecx,%edi
-	movl	60(%esi),%ebp
-	addl	%edi,%eax
-	movl	$-1,%edi
-	roll	$6,%eax
-	xorl	%ecx,%edi
-	addl	%ebx,%eax
-	# R3 57 
-	orl	%eax,%edi
-	leal	4264355552(%edx,%ebp,1),%edx
-	xorl	%ebx,%edi
-	movl	24(%esi),%ebp
-	addl	%edi,%edx
-	movl	$-1,%edi
-	roll	$10,%edx
-	xorl	%ebx,%edi
-	addl	%eax,%edx
-	# R3 58 
-	orl	%edx,%edi
-	leal	2734768916(%ecx,%ebp,1),%ecx
-	xorl	%eax,%edi
-	movl	52(%esi),%ebp
-	addl	%edi,%ecx
-	movl	$-1,%edi
-	roll	$15,%ecx
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	# R3 59 
-	orl	%ecx,%edi
-	leal	1309151649(%ebx,%ebp,1),%ebx
-	xorl	%edx,%edi
-	movl	16(%esi),%ebp
-	addl	%edi,%ebx
-	movl	$-1,%edi
-	roll	$21,%ebx
-	xorl	%edx,%edi
-	addl	%ecx,%ebx
-	# R3 60 
-	orl	%ebx,%edi
-	leal	4149444226(%eax,%ebp,1),%eax
-	xorl	%ecx,%edi
-	movl	44(%esi),%ebp
-	addl	%edi,%eax
-	movl	$-1,%edi
-	roll	$6,%eax
-	xorl	%ecx,%edi
-	addl	%ebx,%eax
-	# R3 61 
-	orl	%eax,%edi
-	leal	3174756917(%edx,%ebp,1),%edx
-	xorl	%ebx,%edi
-	movl	8(%esi),%ebp
-	addl	%edi,%edx
-	movl	$-1,%edi
-	roll	$10,%edx
-	xorl	%ebx,%edi
-	addl	%eax,%edx
-	# R3 62 
-	orl	%edx,%edi
-	leal	718787259(%ecx,%ebp,1),%ecx
-	xorl	%eax,%edi
-	movl	36(%esi),%ebp
-	addl	%edi,%ecx
-	movl	$-1,%edi
-	roll	$15,%ecx
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	# R3 63 
-	orl	%ecx,%edi
-	leal	3951481745(%ebx,%ebp,1),%ebx
-	xorl	%edx,%edi
-	movl	24(%esp),%ebp
-	addl	%edi,%ebx
-	addl	$64,%esi
-	roll	$21,%ebx
-	movl	(%ebp),%edi
-	addl	%ecx,%ebx
-	addl	%edi,%eax
-	movl	4(%ebp),%edi
-	addl	%edi,%ebx
-	movl	8(%ebp),%edi
-	addl	%edi,%ecx
-	movl	12(%ebp),%edi
-	addl	%edi,%edx
-	movl	%eax,(%ebp)
-	movl	%ebx,4(%ebp)
-	movl	(%esp),%edi
-	movl	%ecx,8(%ebp)
-	movl	%edx,12(%ebp)
-	cmpl	%esi,%edi
-	jae	L000start
-	popl	%eax
-	popl	%ebx
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	ret
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha1-586-apple.S b/apple-x86/crypto/fipsmodule/sha1-586-apple.S
deleted file mode 100644
index 76ee6bc..0000000
--- a/apple-x86/crypto/fipsmodule/sha1-586-apple.S
+++ /dev/null
@@ -1,3804 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_sha1_block_data_order
-.private_extern	_sha1_block_data_order
-.align	4
-_sha1_block_data_order:
-L_sha1_block_data_order_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	call	L000pic_point
-L000pic_point:
-	popl	%ebp
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L000pic_point(%ebp),%esi
-	leal	LK_XX_XX-L000pic_point(%ebp),%ebp
-	movl	(%esi),%eax
-	movl	4(%esi),%edx
-	testl	$512,%edx
-	jz	L001x86
-	movl	8(%esi),%ecx
-	testl	$16777216,%eax
-	jz	L001x86
-	andl	$268435456,%edx
-	andl	$1073741824,%eax
-	orl	%edx,%eax
-	cmpl	$1342177280,%eax
-	je	Lavx_shortcut
-	jmp	Lssse3_shortcut
-.align	4,0x90
-L001x86:
-	movl	20(%esp),%ebp
-	movl	24(%esp),%esi
-	movl	28(%esp),%eax
-	subl	$76,%esp
-	shll	$6,%eax
-	addl	%esi,%eax
-	movl	%eax,104(%esp)
-	movl	16(%ebp),%edi
-	jmp	L002loop
-.align	4,0x90
-L002loop:
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%edx,12(%esp)
-	movl	16(%esi),%eax
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	movl	%eax,16(%esp)
-	movl	%ebx,20(%esp)
-	movl	%ecx,24(%esp)
-	movl	%edx,28(%esp)
-	movl	32(%esi),%eax
-	movl	36(%esi),%ebx
-	movl	40(%esi),%ecx
-	movl	44(%esi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,40(%esp)
-	movl	%edx,44(%esp)
-	movl	48(%esi),%eax
-	movl	52(%esi),%ebx
-	movl	56(%esi),%ecx
-	movl	60(%esi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	movl	%eax,48(%esp)
-	movl	%ebx,52(%esp)
-	movl	%ecx,56(%esp)
-	movl	%edx,60(%esp)
-	movl	%esi,100(%esp)
-	movl	(%ebp),%eax
-	movl	4(%ebp),%ebx
-	movl	8(%ebp),%ecx
-	movl	12(%ebp),%edx
-	# 00_15 0 
-	movl	%ecx,%esi
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	xorl	%edx,%esi
-	addl	%edi,%ebp
-	movl	(%esp),%edi
-	andl	%ebx,%esi
-	rorl	$2,%ebx
-	xorl	%edx,%esi
-	leal	1518500249(%ebp,%edi,1),%ebp
-	addl	%esi,%ebp
-	# 00_15 1 
-	movl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	xorl	%ecx,%edi
-	addl	%edx,%ebp
-	movl	4(%esp),%edx
-	andl	%eax,%edi
-	rorl	$2,%eax
-	xorl	%ecx,%edi
-	leal	1518500249(%ebp,%edx,1),%ebp
-	addl	%edi,%ebp
-	# 00_15 2 
-	movl	%eax,%edx
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	xorl	%ebx,%edx
-	addl	%ecx,%ebp
-	movl	8(%esp),%ecx
-	andl	%esi,%edx
-	rorl	$2,%esi
-	xorl	%ebx,%edx
-	leal	1518500249(%ebp,%ecx,1),%ebp
-	addl	%edx,%ebp
-	# 00_15 3 
-	movl	%esi,%ecx
-	movl	%ebp,%edx
-	roll	$5,%ebp
-	xorl	%eax,%ecx
-	addl	%ebx,%ebp
-	movl	12(%esp),%ebx
-	andl	%edi,%ecx
-	rorl	$2,%edi
-	xorl	%eax,%ecx
-	leal	1518500249(%ebp,%ebx,1),%ebp
-	addl	%ecx,%ebp
-	# 00_15 4 
-	movl	%edi,%ebx
-	movl	%ebp,%ecx
-	roll	$5,%ebp
-	xorl	%esi,%ebx
-	addl	%eax,%ebp
-	movl	16(%esp),%eax
-	andl	%edx,%ebx
-	rorl	$2,%edx
-	xorl	%esi,%ebx
-	leal	1518500249(%ebp,%eax,1),%ebp
-	addl	%ebx,%ebp
-	# 00_15 5 
-	movl	%edx,%eax
-	movl	%ebp,%ebx
-	roll	$5,%ebp
-	xorl	%edi,%eax
-	addl	%esi,%ebp
-	movl	20(%esp),%esi
-	andl	%ecx,%eax
-	rorl	$2,%ecx
-	xorl	%edi,%eax
-	leal	1518500249(%ebp,%esi,1),%ebp
-	addl	%eax,%ebp
-	# 00_15 6 
-	movl	%ecx,%esi
-	movl	%ebp,%eax
-	roll	$5,%ebp
-	xorl	%edx,%esi
-	addl	%edi,%ebp
-	movl	24(%esp),%edi
-	andl	%ebx,%esi
-	rorl	$2,%ebx
-	xorl	%edx,%esi
-	leal	1518500249(%ebp,%edi,1),%ebp
-	addl	%esi,%ebp
-	# 00_15 7 
-	movl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	xorl	%ecx,%edi
-	addl	%edx,%ebp
-	movl	28(%esp),%edx
-	andl	%eax,%edi
-	rorl	$2,%eax
-	xorl	%ecx,%edi
-	leal	1518500249(%ebp,%edx,1),%ebp
-	addl	%edi,%ebp
-	# 00_15 8 
-	movl	%eax,%edx
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	xorl	%ebx,%edx
-	addl	%ecx,%ebp
-	movl	32(%esp),%ecx
-	andl	%esi,%edx
-	rorl	$2,%esi
-	xorl	%ebx,%edx
-	leal	1518500249(%ebp,%ecx,1),%ebp
-	addl	%edx,%ebp
-	# 00_15 9 
-	movl	%esi,%ecx
-	movl	%ebp,%edx
-	roll	$5,%ebp
-	xorl	%eax,%ecx
-	addl	%ebx,%ebp
-	movl	36(%esp),%ebx
-	andl	%edi,%ecx
-	rorl	$2,%edi
-	xorl	%eax,%ecx
-	leal	1518500249(%ebp,%ebx,1),%ebp
-	addl	%ecx,%ebp
-	# 00_15 10 
-	movl	%edi,%ebx
-	movl	%ebp,%ecx
-	roll	$5,%ebp
-	xorl	%esi,%ebx
-	addl	%eax,%ebp
-	movl	40(%esp),%eax
-	andl	%edx,%ebx
-	rorl	$2,%edx
-	xorl	%esi,%ebx
-	leal	1518500249(%ebp,%eax,1),%ebp
-	addl	%ebx,%ebp
-	# 00_15 11 
-	movl	%edx,%eax
-	movl	%ebp,%ebx
-	roll	$5,%ebp
-	xorl	%edi,%eax
-	addl	%esi,%ebp
-	movl	44(%esp),%esi
-	andl	%ecx,%eax
-	rorl	$2,%ecx
-	xorl	%edi,%eax
-	leal	1518500249(%ebp,%esi,1),%ebp
-	addl	%eax,%ebp
-	# 00_15 12 
-	movl	%ecx,%esi
-	movl	%ebp,%eax
-	roll	$5,%ebp
-	xorl	%edx,%esi
-	addl	%edi,%ebp
-	movl	48(%esp),%edi
-	andl	%ebx,%esi
-	rorl	$2,%ebx
-	xorl	%edx,%esi
-	leal	1518500249(%ebp,%edi,1),%ebp
-	addl	%esi,%ebp
-	# 00_15 13 
-	movl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	xorl	%ecx,%edi
-	addl	%edx,%ebp
-	movl	52(%esp),%edx
-	andl	%eax,%edi
-	rorl	$2,%eax
-	xorl	%ecx,%edi
-	leal	1518500249(%ebp,%edx,1),%ebp
-	addl	%edi,%ebp
-	# 00_15 14 
-	movl	%eax,%edx
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	xorl	%ebx,%edx
-	addl	%ecx,%ebp
-	movl	56(%esp),%ecx
-	andl	%esi,%edx
-	rorl	$2,%esi
-	xorl	%ebx,%edx
-	leal	1518500249(%ebp,%ecx,1),%ebp
-	addl	%edx,%ebp
-	# 00_15 15 
-	movl	%esi,%ecx
-	movl	%ebp,%edx
-	roll	$5,%ebp
-	xorl	%eax,%ecx
-	addl	%ebx,%ebp
-	movl	60(%esp),%ebx
-	andl	%edi,%ecx
-	rorl	$2,%edi
-	xorl	%eax,%ecx
-	leal	1518500249(%ebp,%ebx,1),%ebp
-	movl	(%esp),%ebx
-	addl	%ebp,%ecx
-	# 16_19 16 
-	movl	%edi,%ebp
-	xorl	8(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	32(%esp),%ebx
-	andl	%edx,%ebp
-	xorl	52(%esp),%ebx
-	roll	$1,%ebx
-	xorl	%esi,%ebp
-	addl	%ebp,%eax
-	movl	%ecx,%ebp
-	rorl	$2,%edx
-	movl	%ebx,(%esp)
-	roll	$5,%ebp
-	leal	1518500249(%ebx,%eax,1),%ebx
-	movl	4(%esp),%eax
-	addl	%ebp,%ebx
-	# 16_19 17 
-	movl	%edx,%ebp
-	xorl	12(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	36(%esp),%eax
-	andl	%ecx,%ebp
-	xorl	56(%esp),%eax
-	roll	$1,%eax
-	xorl	%edi,%ebp
-	addl	%ebp,%esi
-	movl	%ebx,%ebp
-	rorl	$2,%ecx
-	movl	%eax,4(%esp)
-	roll	$5,%ebp
-	leal	1518500249(%eax,%esi,1),%eax
-	movl	8(%esp),%esi
-	addl	%ebp,%eax
-	# 16_19 18 
-	movl	%ecx,%ebp
-	xorl	16(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	40(%esp),%esi
-	andl	%ebx,%ebp
-	xorl	60(%esp),%esi
-	roll	$1,%esi
-	xorl	%edx,%ebp
-	addl	%ebp,%edi
-	movl	%eax,%ebp
-	rorl	$2,%ebx
-	movl	%esi,8(%esp)
-	roll	$5,%ebp
-	leal	1518500249(%esi,%edi,1),%esi
-	movl	12(%esp),%edi
-	addl	%ebp,%esi
-	# 16_19 19 
-	movl	%ebx,%ebp
-	xorl	20(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	44(%esp),%edi
-	andl	%eax,%ebp
-	xorl	(%esp),%edi
-	roll	$1,%edi
-	xorl	%ecx,%ebp
-	addl	%ebp,%edx
-	movl	%esi,%ebp
-	rorl	$2,%eax
-	movl	%edi,12(%esp)
-	roll	$5,%ebp
-	leal	1518500249(%edi,%edx,1),%edi
-	movl	16(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 20 
-	movl	%esi,%ebp
-	xorl	24(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	48(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	4(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,16(%esp)
-	leal	1859775393(%edx,%ecx,1),%edx
-	movl	20(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 21 
-	movl	%edi,%ebp
-	xorl	28(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	8(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,20(%esp)
-	leal	1859775393(%ecx,%ebx,1),%ecx
-	movl	24(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 22 
-	movl	%edx,%ebp
-	xorl	32(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	56(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	12(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,24(%esp)
-	leal	1859775393(%ebx,%eax,1),%ebx
-	movl	28(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 23 
-	movl	%ecx,%ebp
-	xorl	36(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	60(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	16(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	movl	%eax,28(%esp)
-	leal	1859775393(%eax,%esi,1),%eax
-	movl	32(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 24 
-	movl	%ebx,%ebp
-	xorl	40(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	20(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,32(%esp)
-	leal	1859775393(%esi,%edi,1),%esi
-	movl	36(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 25 
-	movl	%eax,%ebp
-	xorl	44(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	4(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	24(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,36(%esp)
-	leal	1859775393(%edi,%edx,1),%edi
-	movl	40(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 26 
-	movl	%esi,%ebp
-	xorl	48(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	8(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	28(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,40(%esp)
-	leal	1859775393(%edx,%ecx,1),%edx
-	movl	44(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 27 
-	movl	%edi,%ebp
-	xorl	52(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	12(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	32(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,44(%esp)
-	leal	1859775393(%ecx,%ebx,1),%ecx
-	movl	48(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 28 
-	movl	%edx,%ebp
-	xorl	56(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	16(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	36(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,48(%esp)
-	leal	1859775393(%ebx,%eax,1),%ebx
-	movl	52(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 29 
-	movl	%ecx,%ebp
-	xorl	60(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	20(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	40(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	movl	%eax,52(%esp)
-	leal	1859775393(%eax,%esi,1),%eax
-	movl	56(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 30 
-	movl	%ebx,%ebp
-	xorl	(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	24(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	44(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,56(%esp)
-	leal	1859775393(%esi,%edi,1),%esi
-	movl	60(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 31 
-	movl	%eax,%ebp
-	xorl	4(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	28(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	48(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,60(%esp)
-	leal	1859775393(%edi,%edx,1),%edi
-	movl	(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 32 
-	movl	%esi,%ebp
-	xorl	8(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	32(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	52(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,(%esp)
-	leal	1859775393(%edx,%ecx,1),%edx
-	movl	4(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 33 
-	movl	%edi,%ebp
-	xorl	12(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	36(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	56(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,4(%esp)
-	leal	1859775393(%ecx,%ebx,1),%ecx
-	movl	8(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 34 
-	movl	%edx,%ebp
-	xorl	16(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	40(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	60(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,8(%esp)
-	leal	1859775393(%ebx,%eax,1),%ebx
-	movl	12(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 35 
-	movl	%ecx,%ebp
-	xorl	20(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	44(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	movl	%eax,12(%esp)
-	leal	1859775393(%eax,%esi,1),%eax
-	movl	16(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 36 
-	movl	%ebx,%ebp
-	xorl	24(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	48(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	4(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,16(%esp)
-	leal	1859775393(%esi,%edi,1),%esi
-	movl	20(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 37 
-	movl	%eax,%ebp
-	xorl	28(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	52(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	8(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,20(%esp)
-	leal	1859775393(%edi,%edx,1),%edi
-	movl	24(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 38 
-	movl	%esi,%ebp
-	xorl	32(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	56(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	12(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,24(%esp)
-	leal	1859775393(%edx,%ecx,1),%edx
-	movl	28(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 39 
-	movl	%edi,%ebp
-	xorl	36(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	60(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	16(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,28(%esp)
-	leal	1859775393(%ecx,%ebx,1),%ecx
-	movl	32(%esp),%ebx
-	addl	%ebp,%ecx
-	# 40_59 40 
-	movl	%edi,%ebp
-	xorl	40(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	(%esp),%ebx
-	andl	%edx,%ebp
-	xorl	20(%esp),%ebx
-	roll	$1,%ebx
-	addl	%eax,%ebp
-	rorl	$2,%edx
-	movl	%ecx,%eax
-	roll	$5,%eax
-	movl	%ebx,32(%esp)
-	leal	2400959708(%ebx,%ebp,1),%ebx
-	movl	%edi,%ebp
-	addl	%eax,%ebx
-	andl	%esi,%ebp
-	movl	36(%esp),%eax
-	addl	%ebp,%ebx
-	# 40_59 41 
-	movl	%edx,%ebp
-	xorl	44(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	4(%esp),%eax
-	andl	%ecx,%ebp
-	xorl	24(%esp),%eax
-	roll	$1,%eax
-	addl	%esi,%ebp
-	rorl	$2,%ecx
-	movl	%ebx,%esi
-	roll	$5,%esi
-	movl	%eax,36(%esp)
-	leal	2400959708(%eax,%ebp,1),%eax
-	movl	%edx,%ebp
-	addl	%esi,%eax
-	andl	%edi,%ebp
-	movl	40(%esp),%esi
-	addl	%ebp,%eax
-	# 40_59 42 
-	movl	%ecx,%ebp
-	xorl	48(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	8(%esp),%esi
-	andl	%ebx,%ebp
-	xorl	28(%esp),%esi
-	roll	$1,%esi
-	addl	%edi,%ebp
-	rorl	$2,%ebx
-	movl	%eax,%edi
-	roll	$5,%edi
-	movl	%esi,40(%esp)
-	leal	2400959708(%esi,%ebp,1),%esi
-	movl	%ecx,%ebp
-	addl	%edi,%esi
-	andl	%edx,%ebp
-	movl	44(%esp),%edi
-	addl	%ebp,%esi
-	# 40_59 43 
-	movl	%ebx,%ebp
-	xorl	52(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	12(%esp),%edi
-	andl	%eax,%ebp
-	xorl	32(%esp),%edi
-	roll	$1,%edi
-	addl	%edx,%ebp
-	rorl	$2,%eax
-	movl	%esi,%edx
-	roll	$5,%edx
-	movl	%edi,44(%esp)
-	leal	2400959708(%edi,%ebp,1),%edi
-	movl	%ebx,%ebp
-	addl	%edx,%edi
-	andl	%ecx,%ebp
-	movl	48(%esp),%edx
-	addl	%ebp,%edi
-	# 40_59 44 
-	movl	%eax,%ebp
-	xorl	56(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	16(%esp),%edx
-	andl	%esi,%ebp
-	xorl	36(%esp),%edx
-	roll	$1,%edx
-	addl	%ecx,%ebp
-	rorl	$2,%esi
-	movl	%edi,%ecx
-	roll	$5,%ecx
-	movl	%edx,48(%esp)
-	leal	2400959708(%edx,%ebp,1),%edx
-	movl	%eax,%ebp
-	addl	%ecx,%edx
-	andl	%ebx,%ebp
-	movl	52(%esp),%ecx
-	addl	%ebp,%edx
-	# 40_59 45 
-	movl	%esi,%ebp
-	xorl	60(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	20(%esp),%ecx
-	andl	%edi,%ebp
-	xorl	40(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebx,%ebp
-	rorl	$2,%edi
-	movl	%edx,%ebx
-	roll	$5,%ebx
-	movl	%ecx,52(%esp)
-	leal	2400959708(%ecx,%ebp,1),%ecx
-	movl	%esi,%ebp
-	addl	%ebx,%ecx
-	andl	%eax,%ebp
-	movl	56(%esp),%ebx
-	addl	%ebp,%ecx
-	# 40_59 46 
-	movl	%edi,%ebp
-	xorl	(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	24(%esp),%ebx
-	andl	%edx,%ebp
-	xorl	44(%esp),%ebx
-	roll	$1,%ebx
-	addl	%eax,%ebp
-	rorl	$2,%edx
-	movl	%ecx,%eax
-	roll	$5,%eax
-	movl	%ebx,56(%esp)
-	leal	2400959708(%ebx,%ebp,1),%ebx
-	movl	%edi,%ebp
-	addl	%eax,%ebx
-	andl	%esi,%ebp
-	movl	60(%esp),%eax
-	addl	%ebp,%ebx
-	# 40_59 47 
-	movl	%edx,%ebp
-	xorl	4(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	28(%esp),%eax
-	andl	%ecx,%ebp
-	xorl	48(%esp),%eax
-	roll	$1,%eax
-	addl	%esi,%ebp
-	rorl	$2,%ecx
-	movl	%ebx,%esi
-	roll	$5,%esi
-	movl	%eax,60(%esp)
-	leal	2400959708(%eax,%ebp,1),%eax
-	movl	%edx,%ebp
-	addl	%esi,%eax
-	andl	%edi,%ebp
-	movl	(%esp),%esi
-	addl	%ebp,%eax
-	# 40_59 48 
-	movl	%ecx,%ebp
-	xorl	8(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	32(%esp),%esi
-	andl	%ebx,%ebp
-	xorl	52(%esp),%esi
-	roll	$1,%esi
-	addl	%edi,%ebp
-	rorl	$2,%ebx
-	movl	%eax,%edi
-	roll	$5,%edi
-	movl	%esi,(%esp)
-	leal	2400959708(%esi,%ebp,1),%esi
-	movl	%ecx,%ebp
-	addl	%edi,%esi
-	andl	%edx,%ebp
-	movl	4(%esp),%edi
-	addl	%ebp,%esi
-	# 40_59 49 
-	movl	%ebx,%ebp
-	xorl	12(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	36(%esp),%edi
-	andl	%eax,%ebp
-	xorl	56(%esp),%edi
-	roll	$1,%edi
-	addl	%edx,%ebp
-	rorl	$2,%eax
-	movl	%esi,%edx
-	roll	$5,%edx
-	movl	%edi,4(%esp)
-	leal	2400959708(%edi,%ebp,1),%edi
-	movl	%ebx,%ebp
-	addl	%edx,%edi
-	andl	%ecx,%ebp
-	movl	8(%esp),%edx
-	addl	%ebp,%edi
-	# 40_59 50 
-	movl	%eax,%ebp
-	xorl	16(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	40(%esp),%edx
-	andl	%esi,%ebp
-	xorl	60(%esp),%edx
-	roll	$1,%edx
-	addl	%ecx,%ebp
-	rorl	$2,%esi
-	movl	%edi,%ecx
-	roll	$5,%ecx
-	movl	%edx,8(%esp)
-	leal	2400959708(%edx,%ebp,1),%edx
-	movl	%eax,%ebp
-	addl	%ecx,%edx
-	andl	%ebx,%ebp
-	movl	12(%esp),%ecx
-	addl	%ebp,%edx
-	# 40_59 51 
-	movl	%esi,%ebp
-	xorl	20(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	44(%esp),%ecx
-	andl	%edi,%ebp
-	xorl	(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebx,%ebp
-	rorl	$2,%edi
-	movl	%edx,%ebx
-	roll	$5,%ebx
-	movl	%ecx,12(%esp)
-	leal	2400959708(%ecx,%ebp,1),%ecx
-	movl	%esi,%ebp
-	addl	%ebx,%ecx
-	andl	%eax,%ebp
-	movl	16(%esp),%ebx
-	addl	%ebp,%ecx
-	# 40_59 52 
-	movl	%edi,%ebp
-	xorl	24(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	48(%esp),%ebx
-	andl	%edx,%ebp
-	xorl	4(%esp),%ebx
-	roll	$1,%ebx
-	addl	%eax,%ebp
-	rorl	$2,%edx
-	movl	%ecx,%eax
-	roll	$5,%eax
-	movl	%ebx,16(%esp)
-	leal	2400959708(%ebx,%ebp,1),%ebx
-	movl	%edi,%ebp
-	addl	%eax,%ebx
-	andl	%esi,%ebp
-	movl	20(%esp),%eax
-	addl	%ebp,%ebx
-	# 40_59 53 
-	movl	%edx,%ebp
-	xorl	28(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	52(%esp),%eax
-	andl	%ecx,%ebp
-	xorl	8(%esp),%eax
-	roll	$1,%eax
-	addl	%esi,%ebp
-	rorl	$2,%ecx
-	movl	%ebx,%esi
-	roll	$5,%esi
-	movl	%eax,20(%esp)
-	leal	2400959708(%eax,%ebp,1),%eax
-	movl	%edx,%ebp
-	addl	%esi,%eax
-	andl	%edi,%ebp
-	movl	24(%esp),%esi
-	addl	%ebp,%eax
-	# 40_59 54 
-	movl	%ecx,%ebp
-	xorl	32(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	56(%esp),%esi
-	andl	%ebx,%ebp
-	xorl	12(%esp),%esi
-	roll	$1,%esi
-	addl	%edi,%ebp
-	rorl	$2,%ebx
-	movl	%eax,%edi
-	roll	$5,%edi
-	movl	%esi,24(%esp)
-	leal	2400959708(%esi,%ebp,1),%esi
-	movl	%ecx,%ebp
-	addl	%edi,%esi
-	andl	%edx,%ebp
-	movl	28(%esp),%edi
-	addl	%ebp,%esi
-	# 40_59 55 
-	movl	%ebx,%ebp
-	xorl	36(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	60(%esp),%edi
-	andl	%eax,%ebp
-	xorl	16(%esp),%edi
-	roll	$1,%edi
-	addl	%edx,%ebp
-	rorl	$2,%eax
-	movl	%esi,%edx
-	roll	$5,%edx
-	movl	%edi,28(%esp)
-	leal	2400959708(%edi,%ebp,1),%edi
-	movl	%ebx,%ebp
-	addl	%edx,%edi
-	andl	%ecx,%ebp
-	movl	32(%esp),%edx
-	addl	%ebp,%edi
-	# 40_59 56 
-	movl	%eax,%ebp
-	xorl	40(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	(%esp),%edx
-	andl	%esi,%ebp
-	xorl	20(%esp),%edx
-	roll	$1,%edx
-	addl	%ecx,%ebp
-	rorl	$2,%esi
-	movl	%edi,%ecx
-	roll	$5,%ecx
-	movl	%edx,32(%esp)
-	leal	2400959708(%edx,%ebp,1),%edx
-	movl	%eax,%ebp
-	addl	%ecx,%edx
-	andl	%ebx,%ebp
-	movl	36(%esp),%ecx
-	addl	%ebp,%edx
-	# 40_59 57 
-	movl	%esi,%ebp
-	xorl	44(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	4(%esp),%ecx
-	andl	%edi,%ebp
-	xorl	24(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebx,%ebp
-	rorl	$2,%edi
-	movl	%edx,%ebx
-	roll	$5,%ebx
-	movl	%ecx,36(%esp)
-	leal	2400959708(%ecx,%ebp,1),%ecx
-	movl	%esi,%ebp
-	addl	%ebx,%ecx
-	andl	%eax,%ebp
-	movl	40(%esp),%ebx
-	addl	%ebp,%ecx
-	# 40_59 58 
-	movl	%edi,%ebp
-	xorl	48(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	8(%esp),%ebx
-	andl	%edx,%ebp
-	xorl	28(%esp),%ebx
-	roll	$1,%ebx
-	addl	%eax,%ebp
-	rorl	$2,%edx
-	movl	%ecx,%eax
-	roll	$5,%eax
-	movl	%ebx,40(%esp)
-	leal	2400959708(%ebx,%ebp,1),%ebx
-	movl	%edi,%ebp
-	addl	%eax,%ebx
-	andl	%esi,%ebp
-	movl	44(%esp),%eax
-	addl	%ebp,%ebx
-	# 40_59 59 
-	movl	%edx,%ebp
-	xorl	52(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	12(%esp),%eax
-	andl	%ecx,%ebp
-	xorl	32(%esp),%eax
-	roll	$1,%eax
-	addl	%esi,%ebp
-	rorl	$2,%ecx
-	movl	%ebx,%esi
-	roll	$5,%esi
-	movl	%eax,44(%esp)
-	leal	2400959708(%eax,%ebp,1),%eax
-	movl	%edx,%ebp
-	addl	%esi,%eax
-	andl	%edi,%ebp
-	movl	48(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 60 
-	movl	%ebx,%ebp
-	xorl	56(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	16(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	36(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,48(%esp)
-	leal	3395469782(%esi,%edi,1),%esi
-	movl	52(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 61 
-	movl	%eax,%ebp
-	xorl	60(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	20(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	40(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,52(%esp)
-	leal	3395469782(%edi,%edx,1),%edi
-	movl	56(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 62 
-	movl	%esi,%ebp
-	xorl	(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	24(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	44(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,56(%esp)
-	leal	3395469782(%edx,%ecx,1),%edx
-	movl	60(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 63 
-	movl	%edi,%ebp
-	xorl	4(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	28(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	48(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,60(%esp)
-	leal	3395469782(%ecx,%ebx,1),%ecx
-	movl	(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 64 
-	movl	%edx,%ebp
-	xorl	8(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	32(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	52(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,(%esp)
-	leal	3395469782(%ebx,%eax,1),%ebx
-	movl	4(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 65 
-	movl	%ecx,%ebp
-	xorl	12(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	36(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	56(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	movl	%eax,4(%esp)
-	leal	3395469782(%eax,%esi,1),%eax
-	movl	8(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 66 
-	movl	%ebx,%ebp
-	xorl	16(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	40(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	60(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,8(%esp)
-	leal	3395469782(%esi,%edi,1),%esi
-	movl	12(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 67 
-	movl	%eax,%ebp
-	xorl	20(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	44(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,12(%esp)
-	leal	3395469782(%edi,%edx,1),%edi
-	movl	16(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 68 
-	movl	%esi,%ebp
-	xorl	24(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	48(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	4(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,16(%esp)
-	leal	3395469782(%edx,%ecx,1),%edx
-	movl	20(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 69 
-	movl	%edi,%ebp
-	xorl	28(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	8(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,20(%esp)
-	leal	3395469782(%ecx,%ebx,1),%ecx
-	movl	24(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 70 
-	movl	%edx,%ebp
-	xorl	32(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	56(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	12(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,24(%esp)
-	leal	3395469782(%ebx,%eax,1),%ebx
-	movl	28(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 71 
-	movl	%ecx,%ebp
-	xorl	36(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	60(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	16(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	movl	%eax,28(%esp)
-	leal	3395469782(%eax,%esi,1),%eax
-	movl	32(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 72 
-	movl	%ebx,%ebp
-	xorl	40(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	20(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	movl	%esi,32(%esp)
-	leal	3395469782(%esi,%edi,1),%esi
-	movl	36(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 73 
-	movl	%eax,%ebp
-	xorl	44(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	4(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	24(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	movl	%edi,36(%esp)
-	leal	3395469782(%edi,%edx,1),%edi
-	movl	40(%esp),%edx
-	addl	%ebp,%edi
-	# 20_39 74 
-	movl	%esi,%ebp
-	xorl	48(%esp),%edx
-	xorl	%eax,%ebp
-	xorl	8(%esp),%edx
-	xorl	%ebx,%ebp
-	xorl	28(%esp),%edx
-	roll	$1,%edx
-	addl	%ebp,%ecx
-	rorl	$2,%esi
-	movl	%edi,%ebp
-	roll	$5,%ebp
-	movl	%edx,40(%esp)
-	leal	3395469782(%edx,%ecx,1),%edx
-	movl	44(%esp),%ecx
-	addl	%ebp,%edx
-	# 20_39 75 
-	movl	%edi,%ebp
-	xorl	52(%esp),%ecx
-	xorl	%esi,%ebp
-	xorl	12(%esp),%ecx
-	xorl	%eax,%ebp
-	xorl	32(%esp),%ecx
-	roll	$1,%ecx
-	addl	%ebp,%ebx
-	rorl	$2,%edi
-	movl	%edx,%ebp
-	roll	$5,%ebp
-	movl	%ecx,44(%esp)
-	leal	3395469782(%ecx,%ebx,1),%ecx
-	movl	48(%esp),%ebx
-	addl	%ebp,%ecx
-	# 20_39 76 
-	movl	%edx,%ebp
-	xorl	56(%esp),%ebx
-	xorl	%edi,%ebp
-	xorl	16(%esp),%ebx
-	xorl	%esi,%ebp
-	xorl	36(%esp),%ebx
-	roll	$1,%ebx
-	addl	%ebp,%eax
-	rorl	$2,%edx
-	movl	%ecx,%ebp
-	roll	$5,%ebp
-	movl	%ebx,48(%esp)
-	leal	3395469782(%ebx,%eax,1),%ebx
-	movl	52(%esp),%eax
-	addl	%ebp,%ebx
-	# 20_39 77 
-	movl	%ecx,%ebp
-	xorl	60(%esp),%eax
-	xorl	%edx,%ebp
-	xorl	20(%esp),%eax
-	xorl	%edi,%ebp
-	xorl	40(%esp),%eax
-	roll	$1,%eax
-	addl	%ebp,%esi
-	rorl	$2,%ecx
-	movl	%ebx,%ebp
-	roll	$5,%ebp
-	leal	3395469782(%eax,%esi,1),%eax
-	movl	56(%esp),%esi
-	addl	%ebp,%eax
-	# 20_39 78 
-	movl	%ebx,%ebp
-	xorl	(%esp),%esi
-	xorl	%ecx,%ebp
-	xorl	24(%esp),%esi
-	xorl	%edx,%ebp
-	xorl	44(%esp),%esi
-	roll	$1,%esi
-	addl	%ebp,%edi
-	rorl	$2,%ebx
-	movl	%eax,%ebp
-	roll	$5,%ebp
-	leal	3395469782(%esi,%edi,1),%esi
-	movl	60(%esp),%edi
-	addl	%ebp,%esi
-	# 20_39 79 
-	movl	%eax,%ebp
-	xorl	4(%esp),%edi
-	xorl	%ebx,%ebp
-	xorl	28(%esp),%edi
-	xorl	%ecx,%ebp
-	xorl	48(%esp),%edi
-	roll	$1,%edi
-	addl	%ebp,%edx
-	rorl	$2,%eax
-	movl	%esi,%ebp
-	roll	$5,%ebp
-	leal	3395469782(%edi,%edx,1),%edi
-	addl	%ebp,%edi
-	movl	96(%esp),%ebp
-	movl	100(%esp),%edx
-	addl	(%ebp),%edi
-	addl	4(%ebp),%esi
-	addl	8(%ebp),%eax
-	addl	12(%ebp),%ebx
-	addl	16(%ebp),%ecx
-	movl	%edi,(%ebp)
-	addl	$64,%edx
-	movl	%esi,4(%ebp)
-	cmpl	104(%esp),%edx
-	movl	%eax,8(%ebp)
-	movl	%ecx,%edi
-	movl	%ebx,12(%ebp)
-	movl	%edx,%esi
-	movl	%ecx,16(%ebp)
-	jb	L002loop
-	addl	$76,%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.private_extern	__sha1_block_data_order_ssse3
-.align	4
-__sha1_block_data_order_ssse3:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	call	L003pic_point
-L003pic_point:
-	popl	%ebp
-	leal	LK_XX_XX-L003pic_point(%ebp),%ebp
-Lssse3_shortcut:
-	movdqa	(%ebp),%xmm7
-	movdqa	16(%ebp),%xmm0
-	movdqa	32(%ebp),%xmm1
-	movdqa	48(%ebp),%xmm2
-	movdqa	64(%ebp),%xmm6
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebp
-	movl	28(%esp),%edx
-	movl	%esp,%esi
-	subl	$208,%esp
-	andl	$-64,%esp
-	movdqa	%xmm0,112(%esp)
-	movdqa	%xmm1,128(%esp)
-	movdqa	%xmm2,144(%esp)
-	shll	$6,%edx
-	movdqa	%xmm7,160(%esp)
-	addl	%ebp,%edx
-	movdqa	%xmm6,176(%esp)
-	addl	$64,%ebp
-	movl	%edi,192(%esp)
-	movl	%ebp,196(%esp)
-	movl	%edx,200(%esp)
-	movl	%esi,204(%esp)
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	movl	16(%edi),%edi
-	movl	%ebx,%esi
-	movdqu	-64(%ebp),%xmm0
-	movdqu	-48(%ebp),%xmm1
-	movdqu	-32(%ebp),%xmm2
-	movdqu	-16(%ebp),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
-	movdqa	%xmm7,96(%esp)
-.byte	102,15,56,0,222
-	paddd	%xmm7,%xmm0
-	paddd	%xmm7,%xmm1
-	paddd	%xmm7,%xmm2
-	movdqa	%xmm0,(%esp)
-	psubd	%xmm7,%xmm0
-	movdqa	%xmm1,16(%esp)
-	psubd	%xmm7,%xmm1
-	movdqa	%xmm2,32(%esp)
-	movl	%ecx,%ebp
-	psubd	%xmm7,%xmm2
-	xorl	%edx,%ebp
-	pshufd	$238,%xmm0,%xmm4
-	andl	%ebp,%esi
-	jmp	L004loop
-.align	4,0x90
-L004loop:
-	rorl	$2,%ebx
-	xorl	%edx,%esi
-	movl	%eax,%ebp
-	punpcklqdq	%xmm1,%xmm4
-	movdqa	%xmm3,%xmm6
-	addl	(%esp),%edi
-	xorl	%ecx,%ebx
-	paddd	%xmm3,%xmm7
-	movdqa	%xmm0,64(%esp)
-	roll	$5,%eax
-	addl	%esi,%edi
-	psrldq	$4,%xmm6
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	pxor	%xmm0,%xmm4
-	addl	%eax,%edi
-	rorl	$7,%eax
-	pxor	%xmm2,%xmm6
-	xorl	%ecx,%ebp
-	movl	%edi,%esi
-	addl	4(%esp),%edx
-	pxor	%xmm6,%xmm4
-	xorl	%ebx,%eax
-	roll	$5,%edi
-	movdqa	%xmm7,48(%esp)
-	addl	%ebp,%edx
-	andl	%eax,%esi
-	movdqa	%xmm4,%xmm0
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	rorl	$7,%edi
-	movdqa	%xmm4,%xmm6
-	xorl	%ebx,%esi
-	pslldq	$12,%xmm0
-	paddd	%xmm4,%xmm4
-	movl	%edx,%ebp
-	addl	8(%esp),%ecx
-	psrld	$31,%xmm6
-	xorl	%eax,%edi
-	roll	$5,%edx
-	movdqa	%xmm0,%xmm7
-	addl	%esi,%ecx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	psrld	$30,%xmm0
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	por	%xmm6,%xmm4
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
-	addl	12(%esp),%ebx
-	pslld	$2,%xmm7
-	xorl	%edi,%edx
-	roll	$5,%ecx
-	pxor	%xmm0,%xmm4
-	movdqa	96(%esp),%xmm0
-	addl	%ebp,%ebx
-	andl	%edx,%esi
-	pxor	%xmm7,%xmm4
-	pshufd	$238,%xmm1,%xmm5
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	movl	%ebx,%ebp
-	punpcklqdq	%xmm2,%xmm5
-	movdqa	%xmm4,%xmm7
-	addl	16(%esp),%eax
-	xorl	%edx,%ecx
-	paddd	%xmm4,%xmm0
-	movdqa	%xmm1,80(%esp)
-	roll	$5,%ebx
-	addl	%esi,%eax
-	psrldq	$4,%xmm7
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	pxor	%xmm1,%xmm5
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	pxor	%xmm3,%xmm7
-	xorl	%edx,%ebp
-	movl	%eax,%esi
-	addl	20(%esp),%edi
-	pxor	%xmm7,%xmm5
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	movdqa	%xmm0,(%esp)
-	addl	%ebp,%edi
-	andl	%ebx,%esi
-	movdqa	%xmm5,%xmm1
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	rorl	$7,%eax
-	movdqa	%xmm5,%xmm7
-	xorl	%ecx,%esi
-	pslldq	$12,%xmm1
-	paddd	%xmm5,%xmm5
-	movl	%edi,%ebp
-	addl	24(%esp),%edx
-	psrld	$31,%xmm7
-	xorl	%ebx,%eax
-	roll	$5,%edi
-	movdqa	%xmm1,%xmm0
-	addl	%esi,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
-	psrld	$30,%xmm1
-	addl	%edi,%edx
-	rorl	$7,%edi
-	por	%xmm7,%xmm5
-	xorl	%ebx,%ebp
-	movl	%edx,%esi
-	addl	28(%esp),%ecx
-	pslld	$2,%xmm0
-	xorl	%eax,%edi
-	roll	$5,%edx
-	pxor	%xmm1,%xmm5
-	movdqa	112(%esp),%xmm1
-	addl	%ebp,%ecx
-	andl	%edi,%esi
-	pxor	%xmm0,%xmm5
-	pshufd	$238,%xmm2,%xmm6
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	xorl	%eax,%esi
-	movl	%ecx,%ebp
-	punpcklqdq	%xmm3,%xmm6
-	movdqa	%xmm5,%xmm0
-	addl	32(%esp),%ebx
-	xorl	%edi,%edx
-	paddd	%xmm5,%xmm1
-	movdqa	%xmm2,96(%esp)
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	psrldq	$4,%xmm0
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	pxor	%xmm2,%xmm6
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	pxor	%xmm4,%xmm0
-	xorl	%edi,%ebp
-	movl	%ebx,%esi
-	addl	36(%esp),%eax
-	pxor	%xmm0,%xmm6
-	xorl	%edx,%ecx
-	roll	$5,%ebx
-	movdqa	%xmm1,16(%esp)
-	addl	%ebp,%eax
-	andl	%ecx,%esi
-	movdqa	%xmm6,%xmm2
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	movdqa	%xmm6,%xmm0
-	xorl	%edx,%esi
-	pslldq	$12,%xmm2
-	paddd	%xmm6,%xmm6
-	movl	%eax,%ebp
-	addl	40(%esp),%edi
-	psrld	$31,%xmm0
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	movdqa	%xmm2,%xmm1
-	addl	%esi,%edi
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	psrld	$30,%xmm2
-	addl	%eax,%edi
-	rorl	$7,%eax
-	por	%xmm0,%xmm6
-	xorl	%ecx,%ebp
-	movdqa	64(%esp),%xmm0
-	movl	%edi,%esi
-	addl	44(%esp),%edx
-	pslld	$2,%xmm1
-	xorl	%ebx,%eax
-	roll	$5,%edi
-	pxor	%xmm2,%xmm6
-	movdqa	112(%esp),%xmm2
-	addl	%ebp,%edx
-	andl	%eax,%esi
-	pxor	%xmm1,%xmm6
-	pshufd	$238,%xmm3,%xmm7
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	rorl	$7,%edi
-	xorl	%ebx,%esi
-	movl	%edx,%ebp
-	punpcklqdq	%xmm4,%xmm7
-	movdqa	%xmm6,%xmm1
-	addl	48(%esp),%ecx
-	xorl	%eax,%edi
-	paddd	%xmm6,%xmm2
-	movdqa	%xmm3,64(%esp)
-	roll	$5,%edx
-	addl	%esi,%ecx
-	psrldq	$4,%xmm1
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	pxor	%xmm3,%xmm7
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	pxor	%xmm5,%xmm1
-	xorl	%eax,%ebp
-	movl	%ecx,%esi
-	addl	52(%esp),%ebx
-	pxor	%xmm1,%xmm7
-	xorl	%edi,%edx
-	roll	$5,%ecx
-	movdqa	%xmm2,32(%esp)
-	addl	%ebp,%ebx
-	andl	%edx,%esi
-	movdqa	%xmm7,%xmm3
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	movdqa	%xmm7,%xmm1
-	xorl	%edi,%esi
-	pslldq	$12,%xmm3
-	paddd	%xmm7,%xmm7
-	movl	%ebx,%ebp
-	addl	56(%esp),%eax
-	psrld	$31,%xmm1
-	xorl	%edx,%ecx
-	roll	$5,%ebx
-	movdqa	%xmm3,%xmm2
-	addl	%esi,%eax
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	psrld	$30,%xmm3
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	por	%xmm1,%xmm7
-	xorl	%edx,%ebp
-	movdqa	80(%esp),%xmm1
-	movl	%eax,%esi
-	addl	60(%esp),%edi
-	pslld	$2,%xmm2
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	pxor	%xmm3,%xmm7
-	movdqa	112(%esp),%xmm3
-	addl	%ebp,%edi
-	andl	%ebx,%esi
-	pxor	%xmm2,%xmm7
-	pshufd	$238,%xmm6,%xmm2
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	rorl	$7,%eax
-	pxor	%xmm4,%xmm0
-	punpcklqdq	%xmm7,%xmm2
-	xorl	%ecx,%esi
-	movl	%edi,%ebp
-	addl	(%esp),%edx
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm4,80(%esp)
-	xorl	%ebx,%eax
-	roll	$5,%edi
-	movdqa	%xmm3,%xmm4
-	addl	%esi,%edx
-	paddd	%xmm7,%xmm3
-	andl	%eax,%ebp
-	pxor	%xmm2,%xmm0
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	rorl	$7,%edi
-	xorl	%ebx,%ebp
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
-	addl	4(%esp),%ecx
-	xorl	%eax,%edi
-	roll	$5,%edx
-	pslld	$2,%xmm0
-	addl	%ebp,%ecx
-	andl	%edi,%esi
-	psrld	$30,%xmm2
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	xorl	%eax,%esi
-	movl	%ecx,%ebp
-	addl	8(%esp),%ebx
-	xorl	%edi,%edx
-	roll	$5,%ecx
-	por	%xmm2,%xmm0
-	addl	%esi,%ebx
-	andl	%edx,%ebp
-	movdqa	96(%esp),%xmm2
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	12(%esp),%eax
-	xorl	%edi,%ebp
-	movl	%ebx,%esi
-	pshufd	$238,%xmm7,%xmm3
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	16(%esp),%edi
-	pxor	%xmm5,%xmm1
-	punpcklqdq	%xmm0,%xmm3
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	roll	$5,%eax
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm5,96(%esp)
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	movdqa	%xmm4,%xmm5
-	rorl	$7,%ebx
-	paddd	%xmm0,%xmm4
-	addl	%eax,%edi
-	pxor	%xmm3,%xmm1
-	addl	20(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	roll	$5,%edi
-	movdqa	%xmm1,%xmm3
-	movdqa	%xmm4,(%esp)
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%edi,%edx
-	pslld	$2,%xmm1
-	addl	24(%esp),%ecx
-	xorl	%eax,%esi
-	psrld	$30,%xmm3
-	movl	%edx,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	por	%xmm3,%xmm1
-	addl	28(%esp),%ebx
-	xorl	%edi,%ebp
-	movdqa	64(%esp),%xmm3
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	rorl	$7,%edx
-	pshufd	$238,%xmm0,%xmm4
-	addl	%ecx,%ebx
-	addl	32(%esp),%eax
-	pxor	%xmm6,%xmm2
-	punpcklqdq	%xmm1,%xmm4
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	roll	$5,%ebx
-	pxor	%xmm3,%xmm2
-	movdqa	%xmm6,64(%esp)
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	movdqa	128(%esp),%xmm6
-	rorl	$7,%ecx
-	paddd	%xmm1,%xmm5
-	addl	%ebx,%eax
-	pxor	%xmm4,%xmm2
-	addl	36(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	roll	$5,%eax
-	movdqa	%xmm2,%xmm4
-	movdqa	%xmm5,16(%esp)
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	pslld	$2,%xmm2
-	addl	40(%esp),%edx
-	xorl	%ebx,%esi
-	psrld	$30,%xmm4
-	movl	%edi,%ebp
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	rorl	$7,%eax
-	addl	%edi,%edx
-	por	%xmm4,%xmm2
-	addl	44(%esp),%ecx
-	xorl	%eax,%ebp
-	movdqa	80(%esp),%xmm4
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%edi
-	pshufd	$238,%xmm1,%xmm5
-	addl	%edx,%ecx
-	addl	48(%esp),%ebx
-	pxor	%xmm7,%xmm3
-	punpcklqdq	%xmm2,%xmm5
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	roll	$5,%ecx
-	pxor	%xmm4,%xmm3
-	movdqa	%xmm7,80(%esp)
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	movdqa	%xmm6,%xmm7
-	rorl	$7,%edx
-	paddd	%xmm2,%xmm6
-	addl	%ecx,%ebx
-	pxor	%xmm5,%xmm3
-	addl	52(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	movdqa	%xmm3,%xmm5
-	movdqa	%xmm6,32(%esp)
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	pslld	$2,%xmm3
-	addl	56(%esp),%edi
-	xorl	%ecx,%esi
-	psrld	$30,%xmm5
-	movl	%eax,%ebp
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	por	%xmm5,%xmm3
-	addl	60(%esp),%edx
-	xorl	%ebx,%ebp
-	movdqa	96(%esp),%xmm5
-	movl	%edi,%esi
-	roll	$5,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	pshufd	$238,%xmm2,%xmm6
-	addl	%edi,%edx
-	addl	(%esp),%ecx
-	pxor	%xmm0,%xmm4
-	punpcklqdq	%xmm3,%xmm6
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	roll	$5,%edx
-	pxor	%xmm5,%xmm4
-	movdqa	%xmm0,96(%esp)
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	movdqa	%xmm7,%xmm0
-	rorl	$7,%edi
-	paddd	%xmm3,%xmm7
-	addl	%edx,%ecx
-	pxor	%xmm6,%xmm4
-	addl	4(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm7,48(%esp)
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	pslld	$2,%xmm4
-	addl	8(%esp),%eax
-	xorl	%edx,%esi
-	psrld	$30,%xmm6
-	movl	%ebx,%ebp
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	por	%xmm6,%xmm4
-	addl	12(%esp),%edi
-	xorl	%ecx,%ebp
-	movdqa	64(%esp),%xmm6
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	pshufd	$238,%xmm3,%xmm7
-	addl	%eax,%edi
-	addl	16(%esp),%edx
-	pxor	%xmm1,%xmm5
-	punpcklqdq	%xmm4,%xmm7
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	roll	$5,%edi
-	pxor	%xmm6,%xmm5
-	movdqa	%xmm1,64(%esp)
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	movdqa	%xmm0,%xmm1
-	rorl	$7,%eax
-	paddd	%xmm4,%xmm0
-	addl	%edi,%edx
-	pxor	%xmm7,%xmm5
-	addl	20(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	roll	$5,%edx
-	movdqa	%xmm5,%xmm7
-	movdqa	%xmm0,(%esp)
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	pslld	$2,%xmm5
-	addl	24(%esp),%ebx
-	xorl	%edi,%esi
-	psrld	$30,%xmm7
-	movl	%ecx,%ebp
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	por	%xmm7,%xmm5
-	addl	28(%esp),%eax
-	movdqa	80(%esp),%xmm7
-	rorl	$7,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%ebp
-	roll	$5,%ebx
-	pshufd	$238,%xmm4,%xmm0
-	addl	%ebp,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	32(%esp),%edi
-	pxor	%xmm2,%xmm6
-	punpcklqdq	%xmm5,%xmm0
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	pxor	%xmm7,%xmm6
-	movdqa	%xmm2,80(%esp)
-	movl	%eax,%ebp
-	xorl	%ecx,%esi
-	roll	$5,%eax
-	movdqa	%xmm1,%xmm2
-	addl	%esi,%edi
-	paddd	%xmm5,%xmm1
-	xorl	%ebx,%ebp
-	pxor	%xmm0,%xmm6
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	36(%esp),%edx
-	andl	%ebx,%ebp
-	movdqa	%xmm6,%xmm0
-	movdqa	%xmm1,16(%esp)
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	movl	%edi,%esi
-	xorl	%ebx,%ebp
-	roll	$5,%edi
-	pslld	$2,%xmm6
-	addl	%ebp,%edx
-	xorl	%eax,%esi
-	psrld	$30,%xmm0
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	40(%esp),%ecx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	rorl	$7,%edi
-	por	%xmm0,%xmm6
-	movl	%edx,%ebp
-	xorl	%eax,%esi
-	movdqa	96(%esp),%xmm0
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%edi,%ebp
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	pshufd	$238,%xmm5,%xmm1
-	addl	44(%esp),%ebx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	rorl	$7,%edx
-	movl	%ecx,%esi
-	xorl	%edi,%ebp
-	roll	$5,%ecx
-	addl	%ebp,%ebx
-	xorl	%edx,%esi
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	48(%esp),%eax
-	pxor	%xmm3,%xmm7
-	punpcklqdq	%xmm6,%xmm1
-	andl	%edx,%esi
-	xorl	%edi,%edx
-	rorl	$7,%ecx
-	pxor	%xmm0,%xmm7
-	movdqa	%xmm3,96(%esp)
-	movl	%ebx,%ebp
-	xorl	%edx,%esi
-	roll	$5,%ebx
-	movdqa	144(%esp),%xmm3
-	addl	%esi,%eax
-	paddd	%xmm6,%xmm2
-	xorl	%ecx,%ebp
-	pxor	%xmm1,%xmm7
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	52(%esp),%edi
-	andl	%ecx,%ebp
-	movdqa	%xmm7,%xmm1
-	movdqa	%xmm2,32(%esp)
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	movl	%eax,%esi
-	xorl	%ecx,%ebp
-	roll	$5,%eax
-	pslld	$2,%xmm7
-	addl	%ebp,%edi
-	xorl	%ebx,%esi
-	psrld	$30,%xmm1
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	56(%esp),%edx
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	por	%xmm1,%xmm7
-	movl	%edi,%ebp
-	xorl	%ebx,%esi
-	movdqa	64(%esp),%xmm1
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	pshufd	$238,%xmm6,%xmm2
-	addl	60(%esp),%ecx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
-	rorl	$7,%edi
-	movl	%edx,%esi
-	xorl	%eax,%ebp
-	roll	$5,%edx
-	addl	%ebp,%ecx
-	xorl	%edi,%esi
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	(%esp),%ebx
-	pxor	%xmm4,%xmm0
-	punpcklqdq	%xmm7,%xmm2
-	andl	%edi,%esi
-	xorl	%eax,%edi
-	rorl	$7,%edx
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm4,64(%esp)
-	movl	%ecx,%ebp
-	xorl	%edi,%esi
-	roll	$5,%ecx
-	movdqa	%xmm3,%xmm4
-	addl	%esi,%ebx
-	paddd	%xmm7,%xmm3
-	xorl	%edx,%ebp
-	pxor	%xmm2,%xmm0
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	4(%esp),%eax
-	andl	%edx,%ebp
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm3,48(%esp)
-	xorl	%edi,%edx
-	rorl	$7,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%ebp
-	roll	$5,%ebx
-	pslld	$2,%xmm0
-	addl	%ebp,%eax
-	xorl	%ecx,%esi
-	psrld	$30,%xmm2
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	8(%esp),%edi
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	por	%xmm2,%xmm0
-	movl	%eax,%ebp
-	xorl	%ecx,%esi
-	movdqa	80(%esp),%xmm2
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	pshufd	$238,%xmm7,%xmm3
-	addl	12(%esp),%edx
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	movl	%edi,%esi
-	xorl	%ebx,%ebp
-	roll	$5,%edi
-	addl	%ebp,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	16(%esp),%ecx
-	pxor	%xmm5,%xmm1
-	punpcklqdq	%xmm0,%xmm3
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	rorl	$7,%edi
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm5,80(%esp)
-	movl	%edx,%ebp
-	xorl	%eax,%esi
-	roll	$5,%edx
-	movdqa	%xmm4,%xmm5
-	addl	%esi,%ecx
-	paddd	%xmm0,%xmm4
-	xorl	%edi,%ebp
-	pxor	%xmm3,%xmm1
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	20(%esp),%ebx
-	andl	%edi,%ebp
-	movdqa	%xmm1,%xmm3
-	movdqa	%xmm4,(%esp)
-	xorl	%eax,%edi
-	rorl	$7,%edx
-	movl	%ecx,%esi
-	xorl	%edi,%ebp
-	roll	$5,%ecx
-	pslld	$2,%xmm1
-	addl	%ebp,%ebx
-	xorl	%edx,%esi
-	psrld	$30,%xmm3
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	24(%esp),%eax
-	andl	%edx,%esi
-	xorl	%edi,%edx
-	rorl	$7,%ecx
-	por	%xmm3,%xmm1
-	movl	%ebx,%ebp
-	xorl	%edx,%esi
-	movdqa	96(%esp),%xmm3
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%ecx,%ebp
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	pshufd	$238,%xmm0,%xmm4
-	addl	28(%esp),%edi
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	movl	%eax,%esi
-	xorl	%ecx,%ebp
-	roll	$5,%eax
-	addl	%ebp,%edi
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	32(%esp),%edx
-	pxor	%xmm6,%xmm2
-	punpcklqdq	%xmm1,%xmm4
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	pxor	%xmm3,%xmm2
-	movdqa	%xmm6,96(%esp)
-	movl	%edi,%ebp
-	xorl	%ebx,%esi
-	roll	$5,%edi
-	movdqa	%xmm5,%xmm6
-	addl	%esi,%edx
-	paddd	%xmm1,%xmm5
-	xorl	%eax,%ebp
-	pxor	%xmm4,%xmm2
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	36(%esp),%ecx
-	andl	%eax,%ebp
-	movdqa	%xmm2,%xmm4
-	movdqa	%xmm5,16(%esp)
-	xorl	%ebx,%eax
-	rorl	$7,%edi
-	movl	%edx,%esi
-	xorl	%eax,%ebp
-	roll	$5,%edx
-	pslld	$2,%xmm2
-	addl	%ebp,%ecx
-	xorl	%edi,%esi
-	psrld	$30,%xmm4
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	40(%esp),%ebx
-	andl	%edi,%esi
-	xorl	%eax,%edi
-	rorl	$7,%edx
-	por	%xmm4,%xmm2
-	movl	%ecx,%ebp
-	xorl	%edi,%esi
-	movdqa	64(%esp),%xmm4
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edx,%ebp
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	pshufd	$238,%xmm1,%xmm5
-	addl	44(%esp),%eax
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	rorl	$7,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%ebp
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	addl	%ebx,%eax
-	addl	48(%esp),%edi
-	pxor	%xmm7,%xmm3
-	punpcklqdq	%xmm2,%xmm5
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	roll	$5,%eax
-	pxor	%xmm4,%xmm3
-	movdqa	%xmm7,64(%esp)
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	movdqa	%xmm6,%xmm7
-	rorl	$7,%ebx
-	paddd	%xmm2,%xmm6
-	addl	%eax,%edi
-	pxor	%xmm5,%xmm3
-	addl	52(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	roll	$5,%edi
-	movdqa	%xmm3,%xmm5
-	movdqa	%xmm6,32(%esp)
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%edi,%edx
-	pslld	$2,%xmm3
-	addl	56(%esp),%ecx
-	xorl	%eax,%esi
-	psrld	$30,%xmm5
-	movl	%edx,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	por	%xmm5,%xmm3
-	addl	60(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	rorl	$7,%ecx
-	paddd	%xmm3,%xmm7
-	addl	%ebx,%eax
-	addl	4(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	movdqa	%xmm7,48(%esp)
-	roll	$5,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	addl	8(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	rorl	$7,%eax
-	addl	%edi,%edx
-	addl	12(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	movl	196(%esp),%ebp
-	cmpl	200(%esp),%ebp
-	je	L005done
-	movdqa	160(%esp),%xmm7
-	movdqa	176(%esp),%xmm6
-	movdqu	(%ebp),%xmm0
-	movdqu	16(%ebp),%xmm1
-	movdqu	32(%ebp),%xmm2
-	movdqu	48(%ebp),%xmm3
-	addl	$64,%ebp
-.byte	102,15,56,0,198
-	movl	%ebp,196(%esp)
-	movdqa	%xmm7,96(%esp)
-	addl	16(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	rorl	$7,%edx
-.byte	102,15,56,0,206
-	addl	%ecx,%ebx
-	addl	20(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	paddd	%xmm7,%xmm0
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	movdqa	%xmm0,(%esp)
-	addl	%ebx,%eax
-	addl	24(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	psubd	%xmm7,%xmm0
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	addl	28(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	roll	$5,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%edi,%edx
-	addl	32(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	rorl	$7,%edi
-.byte	102,15,56,0,214
-	addl	%edx,%ecx
-	addl	36(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	paddd	%xmm7,%xmm1
-	roll	$5,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	rorl	$7,%edx
-	movdqa	%xmm1,16(%esp)
-	addl	%ecx,%ebx
-	addl	40(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	psubd	%xmm7,%xmm1
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	44(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	addl	48(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	rorl	$7,%eax
-.byte	102,15,56,0,222
-	addl	%edi,%edx
-	addl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	paddd	%xmm7,%xmm2
-	roll	$5,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%edi
-	movdqa	%xmm2,32(%esp)
-	addl	%edx,%ecx
-	addl	56(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	psubd	%xmm7,%xmm2
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	60(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	movl	192(%esp),%ebp
-	addl	(%ebp),%eax
-	addl	4(%ebp),%esi
-	addl	8(%ebp),%ecx
-	movl	%eax,(%ebp)
-	addl	12(%ebp),%edx
-	movl	%esi,4(%ebp)
-	addl	16(%ebp),%edi
-	movl	%ecx,8(%ebp)
-	movl	%ecx,%ebx
-	movl	%edx,12(%ebp)
-	xorl	%edx,%ebx
-	movl	%edi,16(%ebp)
-	movl	%esi,%ebp
-	pshufd	$238,%xmm0,%xmm4
-	andl	%ebx,%esi
-	movl	%ebp,%ebx
-	jmp	L004loop
-.align	4,0x90
-L005done:
-	addl	16(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	20(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	24(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	roll	$5,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	addl	28(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	roll	$5,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%edi,%edx
-	addl	32(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	addl	36(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	40(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	44(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%edi
-	addl	48(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	roll	$5,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	rorl	$7,%eax
-	addl	%edi,%edx
-	addl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%edi
-	addl	%edx,%ecx
-	addl	56(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	60(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%ebp,%eax
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	movl	192(%esp),%ebp
-	addl	(%ebp),%eax
-	movl	204(%esp),%esp
-	addl	4(%ebp),%esi
-	addl	8(%ebp),%ecx
-	movl	%eax,(%ebp)
-	addl	12(%ebp),%edx
-	movl	%esi,4(%ebp)
-	addl	16(%ebp),%edi
-	movl	%ecx,8(%ebp)
-	movl	%edx,12(%ebp)
-	movl	%edi,16(%ebp)
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.private_extern	__sha1_block_data_order_avx
-.align	4
-__sha1_block_data_order_avx:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	call	L006pic_point
-L006pic_point:
-	popl	%ebp
-	leal	LK_XX_XX-L006pic_point(%ebp),%ebp
-Lavx_shortcut:
-	vzeroall
-	vmovdqa	(%ebp),%xmm7
-	vmovdqa	16(%ebp),%xmm0
-	vmovdqa	32(%ebp),%xmm1
-	vmovdqa	48(%ebp),%xmm2
-	vmovdqa	64(%ebp),%xmm6
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebp
-	movl	28(%esp),%edx
-	movl	%esp,%esi
-	subl	$208,%esp
-	andl	$-64,%esp
-	vmovdqa	%xmm0,112(%esp)
-	vmovdqa	%xmm1,128(%esp)
-	vmovdqa	%xmm2,144(%esp)
-	shll	$6,%edx
-	vmovdqa	%xmm7,160(%esp)
-	addl	%ebp,%edx
-	vmovdqa	%xmm6,176(%esp)
-	addl	$64,%ebp
-	movl	%edi,192(%esp)
-	movl	%ebp,196(%esp)
-	movl	%edx,200(%esp)
-	movl	%esi,204(%esp)
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	movl	16(%edi),%edi
-	movl	%ebx,%esi
-	vmovdqu	-64(%ebp),%xmm0
-	vmovdqu	-48(%ebp),%xmm1
-	vmovdqu	-32(%ebp),%xmm2
-	vmovdqu	-16(%ebp),%xmm3
-	vpshufb	%xmm6,%xmm0,%xmm0
-	vpshufb	%xmm6,%xmm1,%xmm1
-	vpshufb	%xmm6,%xmm2,%xmm2
-	vmovdqa	%xmm7,96(%esp)
-	vpshufb	%xmm6,%xmm3,%xmm3
-	vpaddd	%xmm7,%xmm0,%xmm4
-	vpaddd	%xmm7,%xmm1,%xmm5
-	vpaddd	%xmm7,%xmm2,%xmm6
-	vmovdqa	%xmm4,(%esp)
-	movl	%ecx,%ebp
-	vmovdqa	%xmm5,16(%esp)
-	xorl	%edx,%ebp
-	vmovdqa	%xmm6,32(%esp)
-	andl	%ebp,%esi
-	jmp	L007loop
-.align	4,0x90
-L007loop:
-	shrdl	$2,%ebx,%ebx
-	xorl	%edx,%esi
-	vpalignr	$8,%xmm0,%xmm1,%xmm4
-	movl	%eax,%ebp
-	addl	(%esp),%edi
-	vpaddd	%xmm3,%xmm7,%xmm7
-	vmovdqa	%xmm0,64(%esp)
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vpsrldq	$4,%xmm3,%xmm6
-	addl	%esi,%edi
-	andl	%ebx,%ebp
-	vpxor	%xmm0,%xmm4,%xmm4
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	vpxor	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%ebp
-	vmovdqa	%xmm7,48(%esp)
-	movl	%edi,%esi
-	addl	4(%esp),%edx
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%ebx,%eax
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	andl	%eax,%esi
-	vpsrld	$31,%xmm4,%xmm6
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	shrdl	$7,%edi,%edi
-	xorl	%ebx,%esi
-	vpslldq	$12,%xmm4,%xmm0
-	vpaddd	%xmm4,%xmm4,%xmm4
-	movl	%edx,%ebp
-	addl	8(%esp),%ecx
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	vpsrld	$30,%xmm0,%xmm7
-	vpor	%xmm6,%xmm4,%xmm4
-	addl	%esi,%ecx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	vpslld	$2,%xmm0,%xmm0
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%ebp
-	vpxor	%xmm7,%xmm4,%xmm4
-	movl	%ecx,%esi
-	addl	12(%esp),%ebx
-	xorl	%edi,%edx
-	shldl	$5,%ecx,%ecx
-	vpxor	%xmm0,%xmm4,%xmm4
-	addl	%ebp,%ebx
-	andl	%edx,%esi
-	vmovdqa	96(%esp),%xmm0
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	shrdl	$7,%ecx,%ecx
-	xorl	%edi,%esi
-	vpalignr	$8,%xmm1,%xmm2,%xmm5
-	movl	%ebx,%ebp
-	addl	16(%esp),%eax
-	vpaddd	%xmm4,%xmm0,%xmm0
-	vmovdqa	%xmm1,80(%esp)
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	vpsrldq	$4,%xmm4,%xmm7
-	addl	%esi,%eax
-	andl	%ecx,%ebp
-	vpxor	%xmm1,%xmm5,%xmm5
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpxor	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%ebp
-	vmovdqa	%xmm0,(%esp)
-	movl	%eax,%esi
-	addl	20(%esp),%edi
-	vpxor	%xmm7,%xmm5,%xmm5
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	andl	%ebx,%esi
-	vpsrld	$31,%xmm5,%xmm7
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%esi
-	vpslldq	$12,%xmm5,%xmm1
-	vpaddd	%xmm5,%xmm5,%xmm5
-	movl	%edi,%ebp
-	addl	24(%esp),%edx
-	xorl	%ebx,%eax
-	shldl	$5,%edi,%edi
-	vpsrld	$30,%xmm1,%xmm0
-	vpor	%xmm7,%xmm5,%xmm5
-	addl	%esi,%edx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	vpslld	$2,%xmm1,%xmm1
-	shrdl	$7,%edi,%edi
-	xorl	%ebx,%ebp
-	vpxor	%xmm0,%xmm5,%xmm5
-	movl	%edx,%esi
-	addl	28(%esp),%ecx
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	vpxor	%xmm1,%xmm5,%xmm5
-	addl	%ebp,%ecx
-	andl	%edi,%esi
-	vmovdqa	112(%esp),%xmm1
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%esi
-	vpalignr	$8,%xmm2,%xmm3,%xmm6
-	movl	%ecx,%ebp
-	addl	32(%esp),%ebx
-	vpaddd	%xmm5,%xmm1,%xmm1
-	vmovdqa	%xmm2,96(%esp)
-	xorl	%edi,%edx
-	shldl	$5,%ecx,%ecx
-	vpsrldq	$4,%xmm5,%xmm0
-	addl	%esi,%ebx
-	andl	%edx,%ebp
-	vpxor	%xmm2,%xmm6,%xmm6
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	vpxor	%xmm4,%xmm0,%xmm0
-	shrdl	$7,%ecx,%ecx
-	xorl	%edi,%ebp
-	vmovdqa	%xmm1,16(%esp)
-	movl	%ebx,%esi
-	addl	36(%esp),%eax
-	vpxor	%xmm0,%xmm6,%xmm6
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	andl	%ecx,%esi
-	vpsrld	$31,%xmm6,%xmm0
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%esi
-	vpslldq	$12,%xmm6,%xmm2
-	vpaddd	%xmm6,%xmm6,%xmm6
-	movl	%eax,%ebp
-	addl	40(%esp),%edi
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vpsrld	$30,%xmm2,%xmm1
-	vpor	%xmm0,%xmm6,%xmm6
-	addl	%esi,%edi
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	vpslld	$2,%xmm2,%xmm2
-	vmovdqa	64(%esp),%xmm0
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%ebp
-	vpxor	%xmm1,%xmm6,%xmm6
-	movl	%edi,%esi
-	addl	44(%esp),%edx
-	xorl	%ebx,%eax
-	shldl	$5,%edi,%edi
-	vpxor	%xmm2,%xmm6,%xmm6
-	addl	%ebp,%edx
-	andl	%eax,%esi
-	vmovdqa	112(%esp),%xmm2
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	shrdl	$7,%edi,%edi
-	xorl	%ebx,%esi
-	vpalignr	$8,%xmm3,%xmm4,%xmm7
-	movl	%edx,%ebp
-	addl	48(%esp),%ecx
-	vpaddd	%xmm6,%xmm2,%xmm2
-	vmovdqa	%xmm3,64(%esp)
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	vpsrldq	$4,%xmm6,%xmm1
-	addl	%esi,%ecx
-	andl	%edi,%ebp
-	vpxor	%xmm3,%xmm7,%xmm7
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	vpxor	%xmm5,%xmm1,%xmm1
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%ebp
-	vmovdqa	%xmm2,32(%esp)
-	movl	%ecx,%esi
-	addl	52(%esp),%ebx
-	vpxor	%xmm1,%xmm7,%xmm7
-	xorl	%edi,%edx
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	andl	%edx,%esi
-	vpsrld	$31,%xmm7,%xmm1
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	shrdl	$7,%ecx,%ecx
-	xorl	%edi,%esi
-	vpslldq	$12,%xmm7,%xmm3
-	vpaddd	%xmm7,%xmm7,%xmm7
-	movl	%ebx,%ebp
-	addl	56(%esp),%eax
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm3,%xmm2
-	vpor	%xmm1,%xmm7,%xmm7
-	addl	%esi,%eax
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpslld	$2,%xmm3,%xmm3
-	vmovdqa	80(%esp),%xmm1
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%ebp
-	vpxor	%xmm2,%xmm7,%xmm7
-	movl	%eax,%esi
-	addl	60(%esp),%edi
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vpxor	%xmm3,%xmm7,%xmm7
-	addl	%ebp,%edi
-	andl	%ebx,%esi
-	vmovdqa	112(%esp),%xmm3
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	vpalignr	$8,%xmm6,%xmm7,%xmm2
-	vpxor	%xmm4,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%esi
-	movl	%edi,%ebp
-	addl	(%esp),%edx
-	vpxor	%xmm1,%xmm0,%xmm0
-	vmovdqa	%xmm4,80(%esp)
-	xorl	%ebx,%eax
-	shldl	$5,%edi,%edi
-	vmovdqa	%xmm3,%xmm4
-	vpaddd	%xmm7,%xmm3,%xmm3
-	addl	%esi,%edx
-	andl	%eax,%ebp
-	vpxor	%xmm2,%xmm0,%xmm0
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	shrdl	$7,%edi,%edi
-	xorl	%ebx,%ebp
-	vpsrld	$30,%xmm0,%xmm2
-	vmovdqa	%xmm3,48(%esp)
-	movl	%edx,%esi
-	addl	4(%esp),%ecx
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	vpslld	$2,%xmm0,%xmm0
-	addl	%ebp,%ecx
-	andl	%edi,%esi
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%esi
-	movl	%ecx,%ebp
-	addl	8(%esp),%ebx
-	vpor	%xmm2,%xmm0,%xmm0
-	xorl	%edi,%edx
-	shldl	$5,%ecx,%ecx
-	vmovdqa	96(%esp),%xmm2
-	addl	%esi,%ebx
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	12(%esp),%eax
-	xorl	%edi,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm7,%xmm0,%xmm3
-	vpxor	%xmm5,%xmm1,%xmm1
-	addl	16(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm5,96(%esp)
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	vmovdqa	%xmm4,%xmm5
-	vpaddd	%xmm0,%xmm4,%xmm4
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	vpxor	%xmm3,%xmm1,%xmm1
-	addl	20(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	shldl	$5,%edi,%edi
-	vpsrld	$30,%xmm1,%xmm3
-	vmovdqa	%xmm4,(%esp)
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpslld	$2,%xmm1,%xmm1
-	addl	24(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vpor	%xmm3,%xmm1,%xmm1
-	addl	28(%esp),%ebx
-	xorl	%edi,%ebp
-	vmovdqa	64(%esp),%xmm3
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpalignr	$8,%xmm0,%xmm1,%xmm4
-	vpxor	%xmm6,%xmm2,%xmm2
-	addl	32(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
-	vpxor	%xmm3,%xmm2,%xmm2
-	vmovdqa	%xmm6,64(%esp)
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	vmovdqa	128(%esp),%xmm6
-	vpaddd	%xmm1,%xmm5,%xmm5
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpxor	%xmm4,%xmm2,%xmm2
-	addl	36(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	vpsrld	$30,%xmm2,%xmm4
-	vmovdqa	%xmm5,16(%esp)
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	vpslld	$2,%xmm2,%xmm2
-	addl	40(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpor	%xmm4,%xmm2,%xmm2
-	addl	44(%esp),%ecx
-	xorl	%eax,%ebp
-	vmovdqa	80(%esp),%xmm4
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vpalignr	$8,%xmm1,%xmm2,%xmm5
-	vpxor	%xmm7,%xmm3,%xmm3
-	addl	48(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	vpxor	%xmm4,%xmm3,%xmm3
-	vmovdqa	%xmm7,80(%esp)
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	vmovdqa	%xmm6,%xmm7
-	vpaddd	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpxor	%xmm5,%xmm3,%xmm3
-	addl	52(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm3,%xmm5
-	vmovdqa	%xmm6,32(%esp)
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpslld	$2,%xmm3,%xmm3
-	addl	56(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	vpor	%xmm5,%xmm3,%xmm3
-	addl	60(%esp),%edx
-	xorl	%ebx,%ebp
-	vmovdqa	96(%esp),%xmm5
-	movl	%edi,%esi
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpalignr	$8,%xmm2,%xmm3,%xmm6
-	vpxor	%xmm0,%xmm4,%xmm4
-	addl	(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
-	vpxor	%xmm5,%xmm4,%xmm4
-	vmovdqa	%xmm0,96(%esp)
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	vmovdqa	%xmm7,%xmm0
-	vpaddd	%xmm3,%xmm7,%xmm7
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vpxor	%xmm6,%xmm4,%xmm4
-	addl	4(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	vpsrld	$30,%xmm4,%xmm6
-	vmovdqa	%xmm7,48(%esp)
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpslld	$2,%xmm4,%xmm4
-	addl	8(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpor	%xmm6,%xmm4,%xmm4
-	addl	12(%esp),%edi
-	xorl	%ecx,%ebp
-	vmovdqa	64(%esp),%xmm6
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	vpalignr	$8,%xmm3,%xmm4,%xmm7
-	vpxor	%xmm1,%xmm5,%xmm5
-	addl	16(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
-	vpxor	%xmm6,%xmm5,%xmm5
-	vmovdqa	%xmm1,64(%esp)
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	vmovdqa	%xmm0,%xmm1
-	vpaddd	%xmm4,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpxor	%xmm7,%xmm5,%xmm5
-	addl	20(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	vpsrld	$30,%xmm5,%xmm7
-	vmovdqa	%xmm0,(%esp)
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vpslld	$2,%xmm5,%xmm5
-	addl	24(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpor	%xmm7,%xmm5,%xmm5
-	addl	28(%esp),%eax
-	vmovdqa	80(%esp),%xmm7
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm4,%xmm5,%xmm0
-	vpxor	%xmm2,%xmm6,%xmm6
-	addl	32(%esp),%edi
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	vpxor	%xmm7,%xmm6,%xmm6
-	vmovdqa	%xmm2,80(%esp)
-	movl	%eax,%ebp
-	xorl	%ecx,%esi
-	vmovdqa	%xmm1,%xmm2
-	vpaddd	%xmm5,%xmm1,%xmm1
-	shldl	$5,%eax,%eax
-	addl	%esi,%edi
-	vpxor	%xmm0,%xmm6,%xmm6
-	xorl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	36(%esp),%edx
-	vpsrld	$30,%xmm6,%xmm0
-	vmovdqa	%xmm1,16(%esp)
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	movl	%edi,%esi
-	vpslld	$2,%xmm6,%xmm6
-	xorl	%ebx,%ebp
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	40(%esp),%ecx
-	andl	%eax,%esi
-	vpor	%xmm0,%xmm6,%xmm6
-	xorl	%ebx,%eax
-	shrdl	$7,%edi,%edi
-	vmovdqa	96(%esp),%xmm0
-	movl	%edx,%ebp
-	xorl	%eax,%esi
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%edi,%ebp
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	44(%esp),%ebx
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	shrdl	$7,%edx,%edx
-	movl	%ecx,%esi
-	xorl	%edi,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edx,%esi
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	vpalignr	$8,%xmm5,%xmm6,%xmm1
-	vpxor	%xmm3,%xmm7,%xmm7
-	addl	48(%esp),%eax
-	andl	%edx,%esi
-	xorl	%edi,%edx
-	shrdl	$7,%ecx,%ecx
-	vpxor	%xmm0,%xmm7,%xmm7
-	vmovdqa	%xmm3,96(%esp)
-	movl	%ebx,%ebp
-	xorl	%edx,%esi
-	vmovdqa	144(%esp),%xmm3
-	vpaddd	%xmm6,%xmm2,%xmm2
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	vpxor	%xmm1,%xmm7,%xmm7
-	xorl	%ecx,%ebp
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	52(%esp),%edi
-	vpsrld	$30,%xmm7,%xmm1
-	vmovdqa	%xmm2,32(%esp)
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	movl	%eax,%esi
-	vpslld	$2,%xmm7,%xmm7
-	xorl	%ecx,%ebp
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	56(%esp),%edx
-	andl	%ebx,%esi
-	vpor	%xmm1,%xmm7,%xmm7
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	vmovdqa	64(%esp),%xmm1
-	movl	%edi,%ebp
-	xorl	%ebx,%esi
-	shldl	$5,%edi,%edi
-	addl	%esi,%edx
-	xorl	%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	60(%esp),%ecx
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
-	shrdl	$7,%edi,%edi
-	movl	%edx,%esi
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%edi,%esi
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	vpalignr	$8,%xmm6,%xmm7,%xmm2
-	vpxor	%xmm4,%xmm0,%xmm0
-	addl	(%esp),%ebx
-	andl	%edi,%esi
-	xorl	%eax,%edi
-	shrdl	$7,%edx,%edx
-	vpxor	%xmm1,%xmm0,%xmm0
-	vmovdqa	%xmm4,64(%esp)
-	movl	%ecx,%ebp
-	xorl	%edi,%esi
-	vmovdqa	%xmm3,%xmm4
-	vpaddd	%xmm7,%xmm3,%xmm3
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	vpxor	%xmm2,%xmm0,%xmm0
-	xorl	%edx,%ebp
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	4(%esp),%eax
-	vpsrld	$30,%xmm0,%xmm2
-	vmovdqa	%xmm3,48(%esp)
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	vpslld	$2,%xmm0,%xmm0
-	xorl	%edx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	8(%esp),%edi
-	andl	%ecx,%esi
-	vpor	%xmm2,%xmm0,%xmm0
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	vmovdqa	80(%esp),%xmm2
-	movl	%eax,%ebp
-	xorl	%ecx,%esi
-	shldl	$5,%eax,%eax
-	addl	%esi,%edi
-	xorl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	addl	12(%esp),%edx
-	andl	%ebx,%ebp
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	movl	%edi,%esi
-	xorl	%ebx,%ebp
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	vpalignr	$8,%xmm7,%xmm0,%xmm3
-	vpxor	%xmm5,%xmm1,%xmm1
-	addl	16(%esp),%ecx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	shrdl	$7,%edi,%edi
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm5,80(%esp)
-	movl	%edx,%ebp
-	xorl	%eax,%esi
-	vmovdqa	%xmm4,%xmm5
-	vpaddd	%xmm0,%xmm4,%xmm4
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	vpxor	%xmm3,%xmm1,%xmm1
-	xorl	%edi,%ebp
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	20(%esp),%ebx
-	vpsrld	$30,%xmm1,%xmm3
-	vmovdqa	%xmm4,(%esp)
-	andl	%edi,%ebp
-	xorl	%eax,%edi
-	shrdl	$7,%edx,%edx
-	movl	%ecx,%esi
-	vpslld	$2,%xmm1,%xmm1
-	xorl	%edi,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edx,%esi
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	24(%esp),%eax
-	andl	%edx,%esi
-	vpor	%xmm3,%xmm1,%xmm1
-	xorl	%edi,%edx
-	shrdl	$7,%ecx,%ecx
-	vmovdqa	96(%esp),%xmm3
-	movl	%ebx,%ebp
-	xorl	%edx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%ecx,%ebp
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	28(%esp),%edi
-	andl	%ecx,%ebp
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	movl	%eax,%esi
-	xorl	%ecx,%ebp
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%edi
-	vpalignr	$8,%xmm0,%xmm1,%xmm4
-	vpxor	%xmm6,%xmm2,%xmm2
-	addl	32(%esp),%edx
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	vpxor	%xmm3,%xmm2,%xmm2
-	vmovdqa	%xmm6,96(%esp)
-	movl	%edi,%ebp
-	xorl	%ebx,%esi
-	vmovdqa	%xmm5,%xmm6
-	vpaddd	%xmm1,%xmm5,%xmm5
-	shldl	$5,%edi,%edi
-	addl	%esi,%edx
-	vpxor	%xmm4,%xmm2,%xmm2
-	xorl	%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%edi,%edx
-	addl	36(%esp),%ecx
-	vpsrld	$30,%xmm2,%xmm4
-	vmovdqa	%xmm5,16(%esp)
-	andl	%eax,%ebp
-	xorl	%ebx,%eax
-	shrdl	$7,%edi,%edi
-	movl	%edx,%esi
-	vpslld	$2,%xmm2,%xmm2
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%edi,%esi
-	xorl	%eax,%edi
-	addl	%edx,%ecx
-	addl	40(%esp),%ebx
-	andl	%edi,%esi
-	vpor	%xmm4,%xmm2,%xmm2
-	xorl	%eax,%edi
-	shrdl	$7,%edx,%edx
-	vmovdqa	64(%esp),%xmm4
-	movl	%ecx,%ebp
-	xorl	%edi,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edx,%ebp
-	xorl	%edi,%edx
-	addl	%ecx,%ebx
-	addl	44(%esp),%eax
-	andl	%edx,%ebp
-	xorl	%edi,%edx
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm1,%xmm2,%xmm5
-	vpxor	%xmm7,%xmm3,%xmm3
-	addl	48(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
-	vpxor	%xmm4,%xmm3,%xmm3
-	vmovdqa	%xmm7,64(%esp)
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	vmovdqa	%xmm6,%xmm7
-	vpaddd	%xmm2,%xmm6,%xmm6
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	vpxor	%xmm5,%xmm3,%xmm3
-	addl	52(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	shldl	$5,%edi,%edi
-	vpsrld	$30,%xmm3,%xmm5
-	vmovdqa	%xmm6,32(%esp)
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vpslld	$2,%xmm3,%xmm3
-	addl	56(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vpor	%xmm5,%xmm3,%xmm3
-	addl	60(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	(%esp),%eax
-	vpaddd	%xmm3,%xmm7,%xmm7
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	vmovdqa	%xmm7,48(%esp)
-	xorl	%edx,%ebp
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	4(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	addl	8(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	12(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	movl	196(%esp),%ebp
-	cmpl	200(%esp),%ebp
-	je	L008done
-	vmovdqa	160(%esp),%xmm7
-	vmovdqa	176(%esp),%xmm6
-	vmovdqu	(%ebp),%xmm0
-	vmovdqu	16(%ebp),%xmm1
-	vmovdqu	32(%ebp),%xmm2
-	vmovdqu	48(%ebp),%xmm3
-	addl	$64,%ebp
-	vpshufb	%xmm6,%xmm0,%xmm0
-	movl	%ebp,196(%esp)
-	vmovdqa	%xmm7,96(%esp)
-	addl	16(%esp),%ebx
-	xorl	%edi,%esi
-	vpshufb	%xmm6,%xmm1,%xmm1
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	vpaddd	%xmm7,%xmm0,%xmm4
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vmovdqa	%xmm4,(%esp)
-	addl	20(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	24(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	addl	28(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	32(%esp),%ecx
-	xorl	%eax,%esi
-	vpshufb	%xmm6,%xmm2,%xmm2
-	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
-	vpaddd	%xmm7,%xmm1,%xmm5
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	vmovdqa	%xmm5,16(%esp)
-	addl	36(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	40(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	44(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	addl	48(%esp),%edx
-	xorl	%ebx,%esi
-	vpshufb	%xmm6,%xmm3,%xmm3
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
-	vpaddd	%xmm7,%xmm2,%xmm6
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	vmovdqa	%xmm6,32(%esp)
-	addl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	addl	56(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	60(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	movl	192(%esp),%ebp
-	addl	(%ebp),%eax
-	addl	4(%ebp),%esi
-	addl	8(%ebp),%ecx
-	movl	%eax,(%ebp)
-	addl	12(%ebp),%edx
-	movl	%esi,4(%ebp)
-	addl	16(%ebp),%edi
-	movl	%ecx,%ebx
-	movl	%ecx,8(%ebp)
-	xorl	%edx,%ebx
-	movl	%edx,12(%ebp)
-	movl	%edi,16(%ebp)
-	movl	%esi,%ebp
-	andl	%ebx,%esi
-	movl	%ebp,%ebx
-	jmp	L007loop
-.align	4,0x90
-L008done:
-	addl	16(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	20(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	24(%esp),%edi
-	xorl	%ecx,%esi
-	movl	%eax,%ebp
-	shldl	$5,%eax,%eax
-	addl	%esi,%edi
-	xorl	%ecx,%ebp
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	addl	28(%esp),%edx
-	xorl	%ebx,%ebp
-	movl	%edi,%esi
-	shldl	$5,%edi,%edi
-	addl	%ebp,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	32(%esp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%ebp
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%ebp
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	addl	36(%esp),%ebx
-	xorl	%edi,%ebp
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%ebp,%ebx
-	xorl	%edi,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	40(%esp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%ebp
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%ebp
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	44(%esp),%edi
-	xorl	%ecx,%ebp
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%ebp,%edi
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%edi
-	addl	48(%esp),%edx
-	xorl	%ebx,%esi
-	movl	%edi,%ebp
-	shldl	$5,%edi,%edi
-	addl	%esi,%edx
-	xorl	%ebx,%ebp
-	shrdl	$7,%eax,%eax
-	addl	%edi,%edx
-	addl	52(%esp),%ecx
-	xorl	%eax,%ebp
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%ebp,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%edi,%edi
-	addl	%edx,%ecx
-	addl	56(%esp),%ebx
-	xorl	%edi,%esi
-	movl	%ecx,%ebp
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edi,%ebp
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	60(%esp),%eax
-	xorl	%edx,%ebp
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%ebp,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vzeroall
-	movl	192(%esp),%ebp
-	addl	(%ebp),%eax
-	movl	204(%esp),%esp
-	addl	4(%ebp),%esi
-	addl	8(%ebp),%ecx
-	movl	%eax,(%ebp)
-	addl	12(%ebp),%edx
-	movl	%esi,4(%ebp)
-	addl	16(%ebp),%edi
-	movl	%ecx,8(%ebp)
-	movl	%edx,12(%ebp)
-	movl	%edi,16(%ebp)
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	6,0x90
-LK_XX_XX:
-.long	1518500249,1518500249,1518500249,1518500249
-.long	1859775393,1859775393,1859775393,1859775393
-.long	2400959708,2400959708,2400959708,2400959708
-.long	3395469782,3395469782,3395469782,3395469782
-.long	66051,67438087,134810123,202182159
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
-.byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
-.byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
-.byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha256-586-apple.S b/apple-x86/crypto/fipsmodule/sha256-586-apple.S
deleted file mode 100644
index d43510a..0000000
--- a/apple-x86/crypto/fipsmodule/sha256-586-apple.S
+++ /dev/null
@@ -1,5567 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_sha256_block_data_order
-.private_extern	_sha256_block_data_order
-.align	4
-_sha256_block_data_order:
-L_sha256_block_data_order_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	%esp,%ebx
-	call	L000pic_point
-L000pic_point:
-	popl	%ebp
-	leal	L001K256-L000pic_point(%ebp),%ebp
-	subl	$16,%esp
-	andl	$-64,%esp
-	shll	$6,%eax
-	addl	%edi,%eax
-	movl	%esi,(%esp)
-	movl	%edi,4(%esp)
-	movl	%eax,8(%esp)
-	movl	%ebx,12(%esp)
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K256(%ebp),%edx
-	movl	(%edx),%ecx
-	movl	4(%edx),%ebx
-	testl	$1048576,%ecx
-	jnz	L002loop
-	movl	8(%edx),%edx
-	testl	$16777216,%ecx
-	jz	L003no_xmm
-	andl	$1073741824,%ecx
-	andl	$268435968,%ebx
-	orl	%ebx,%ecx
-	andl	$1342177280,%ecx
-	cmpl	$1342177280,%ecx
-	je	L004AVX
-	testl	$512,%ebx
-	jnz	L005SSSE3
-L003no_xmm:
-	subl	%edi,%eax
-	cmpl	$256,%eax
-	jae	L006unrolled
-	jmp	L002loop
-.align	4,0x90
-L002loop:
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	bswap	%eax
-	movl	12(%edi),%edx
-	bswap	%ebx
-	pushl	%eax
-	bswap	%ecx
-	pushl	%ebx
-	bswap	%edx
-	pushl	%ecx
-	pushl	%edx
-	movl	16(%edi),%eax
-	movl	20(%edi),%ebx
-	movl	24(%edi),%ecx
-	bswap	%eax
-	movl	28(%edi),%edx
-	bswap	%ebx
-	pushl	%eax
-	bswap	%ecx
-	pushl	%ebx
-	bswap	%edx
-	pushl	%ecx
-	pushl	%edx
-	movl	32(%edi),%eax
-	movl	36(%edi),%ebx
-	movl	40(%edi),%ecx
-	bswap	%eax
-	movl	44(%edi),%edx
-	bswap	%ebx
-	pushl	%eax
-	bswap	%ecx
-	pushl	%ebx
-	bswap	%edx
-	pushl	%ecx
-	pushl	%edx
-	movl	48(%edi),%eax
-	movl	52(%edi),%ebx
-	movl	56(%edi),%ecx
-	bswap	%eax
-	movl	60(%edi),%edx
-	bswap	%ebx
-	pushl	%eax
-	bswap	%ecx
-	pushl	%ebx
-	bswap	%edx
-	pushl	%ecx
-	pushl	%edx
-	addl	$64,%edi
-	leal	-36(%esp),%esp
-	movl	%edi,104(%esp)
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edi
-	movl	%ebx,8(%esp)
-	xorl	%ecx,%ebx
-	movl	%ecx,12(%esp)
-	movl	%edi,16(%esp)
-	movl	%ebx,(%esp)
-	movl	16(%esi),%edx
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edi
-	movl	%ebx,24(%esp)
-	movl	%ecx,28(%esp)
-	movl	%edi,32(%esp)
-.align	4,0x90
-L00700_15:
-	movl	%edx,%ecx
-	movl	24(%esp),%esi
-	rorl	$14,%ecx
-	movl	28(%esp),%edi
-	xorl	%edx,%ecx
-	xorl	%edi,%esi
-	movl	96(%esp),%ebx
-	rorl	$5,%ecx
-	andl	%edx,%esi
-	movl	%edx,20(%esp)
-	xorl	%ecx,%edx
-	addl	32(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%esi,%ebx
-	rorl	$9,%ecx
-	addl	%edx,%ebx
-	movl	8(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,4(%esp)
-	leal	-4(%esp),%esp
-	rorl	$11,%ecx
-	movl	(%ebp),%esi
-	xorl	%eax,%ecx
-	movl	20(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%esi,%ebx
-	movl	%eax,(%esp)
-	addl	%ebx,%edx
-	andl	4(%esp),%eax
-	addl	%ecx,%ebx
-	xorl	%edi,%eax
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	cmpl	$3248222580,%esi
-	jne	L00700_15
-	movl	156(%esp),%ecx
-	jmp	L00816_63
-.align	4,0x90
-L00816_63:
-	movl	%ecx,%ebx
-	movl	104(%esp),%esi
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	160(%esp),%ebx
-	shrl	$10,%edi
-	addl	124(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	24(%esp),%esi
-	rorl	$14,%ecx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%edx,%ecx
-	xorl	%edi,%esi
-	movl	%ebx,96(%esp)
-	rorl	$5,%ecx
-	andl	%edx,%esi
-	movl	%edx,20(%esp)
-	xorl	%ecx,%edx
-	addl	32(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%esi,%ebx
-	rorl	$9,%ecx
-	addl	%edx,%ebx
-	movl	8(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,4(%esp)
-	leal	-4(%esp),%esp
-	rorl	$11,%ecx
-	movl	(%ebp),%esi
-	xorl	%eax,%ecx
-	movl	20(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%esi,%ebx
-	movl	%eax,(%esp)
-	addl	%ebx,%edx
-	andl	4(%esp),%eax
-	addl	%ecx,%ebx
-	xorl	%edi,%eax
-	movl	156(%esp),%ecx
-	addl	$4,%ebp
-	addl	%ebx,%eax
-	cmpl	$3329325298,%esi
-	jne	L00816_63
-	movl	356(%esp),%esi
-	movl	8(%esp),%ebx
-	movl	16(%esp),%ecx
-	addl	(%esi),%eax
-	addl	4(%esi),%ebx
-	addl	8(%esi),%edi
-	addl	12(%esi),%ecx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	movl	%edi,8(%esi)
-	movl	%ecx,12(%esi)
-	movl	24(%esp),%eax
-	movl	28(%esp),%ebx
-	movl	32(%esp),%ecx
-	movl	360(%esp),%edi
-	addl	16(%esi),%edx
-	addl	20(%esi),%eax
-	addl	24(%esi),%ebx
-	addl	28(%esi),%ecx
-	movl	%edx,16(%esi)
-	movl	%eax,20(%esi)
-	movl	%ebx,24(%esi)
-	movl	%ecx,28(%esi)
-	leal	356(%esp),%esp
-	subl	$256,%ebp
-	cmpl	8(%esp),%edi
-	jb	L002loop
-	movl	12(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	6,0x90
-L001K256:
-.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
-.long	66051,67438087,134810123,202182159
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
-.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte	62,0
-.align	4,0x90
-L006unrolled:
-	leal	-96(%esp),%esp
-	movl	(%esi),%eax
-	movl	4(%esi),%ebp
-	movl	8(%esi),%ecx
-	movl	12(%esi),%ebx
-	movl	%ebp,4(%esp)
-	xorl	%ecx,%ebp
-	movl	%ecx,8(%esp)
-	movl	%ebx,12(%esp)
-	movl	16(%esi),%edx
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%esi
-	movl	%ebx,20(%esp)
-	movl	%ecx,24(%esp)
-	movl	%esi,28(%esp)
-	jmp	L009grand_loop
-.align	4,0x90
-L009grand_loop:
-	movl	(%edi),%ebx
-	movl	4(%edi),%ecx
-	bswap	%ebx
-	movl	8(%edi),%esi
-	bswap	%ecx
-	movl	%ebx,32(%esp)
-	bswap	%esi
-	movl	%ecx,36(%esp)
-	movl	%esi,40(%esp)
-	movl	12(%edi),%ebx
-	movl	16(%edi),%ecx
-	bswap	%ebx
-	movl	20(%edi),%esi
-	bswap	%ecx
-	movl	%ebx,44(%esp)
-	bswap	%esi
-	movl	%ecx,48(%esp)
-	movl	%esi,52(%esp)
-	movl	24(%edi),%ebx
-	movl	28(%edi),%ecx
-	bswap	%ebx
-	movl	32(%edi),%esi
-	bswap	%ecx
-	movl	%ebx,56(%esp)
-	bswap	%esi
-	movl	%ecx,60(%esp)
-	movl	%esi,64(%esp)
-	movl	36(%edi),%ebx
-	movl	40(%edi),%ecx
-	bswap	%ebx
-	movl	44(%edi),%esi
-	bswap	%ecx
-	movl	%ebx,68(%esp)
-	bswap	%esi
-	movl	%ecx,72(%esp)
-	movl	%esi,76(%esp)
-	movl	48(%edi),%ebx
-	movl	52(%edi),%ecx
-	bswap	%ebx
-	movl	56(%edi),%esi
-	bswap	%ecx
-	movl	%ebx,80(%esp)
-	bswap	%esi
-	movl	%ecx,84(%esp)
-	movl	%esi,88(%esp)
-	movl	60(%edi),%ebx
-	addl	$64,%edi
-	bswap	%ebx
-	movl	%edi,100(%esp)
-	movl	%ebx,92(%esp)
-	movl	%edx,%ecx
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	32(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1116352408(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	36(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1899447441(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	40(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3049323471(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	44(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3921009573(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	48(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	961987163(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	52(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1508970993(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	56(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2453635748(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	60(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2870763221(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	64(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3624381080(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	68(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	310598401(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	72(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	607225278(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	76(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1426881987(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	80(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1925078388(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	84(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2162078206(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	%edx,%ecx
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	88(%esp),%ebx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2614888103(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	%edx,%esi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	92(%esp),%ebx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3248222580(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	36(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	88(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	32(%esp),%ebx
-	shrl	$10,%edi
-	addl	68(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,32(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3835390401(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	40(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	92(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	36(%esp),%ebx
-	shrl	$10,%edi
-	addl	72(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,36(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	4022224774(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	44(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	32(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	40(%esp),%ebx
-	shrl	$10,%edi
-	addl	76(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,40(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	264347078(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	48(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	36(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	44(%esp),%ebx
-	shrl	$10,%edi
-	addl	80(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,44(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	604807628(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	52(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	40(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	48(%esp),%ebx
-	shrl	$10,%edi
-	addl	84(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,48(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	770255983(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	56(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	44(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	52(%esp),%ebx
-	shrl	$10,%edi
-	addl	88(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,52(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1249150122(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	60(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	48(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	56(%esp),%ebx
-	shrl	$10,%edi
-	addl	92(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,56(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1555081692(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	64(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	52(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	60(%esp),%ebx
-	shrl	$10,%edi
-	addl	32(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,60(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1996064986(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	68(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	56(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	64(%esp),%ebx
-	shrl	$10,%edi
-	addl	36(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,64(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2554220882(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	72(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	60(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	68(%esp),%ebx
-	shrl	$10,%edi
-	addl	40(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,68(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2821834349(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	76(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	64(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	72(%esp),%ebx
-	shrl	$10,%edi
-	addl	44(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,72(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2952996808(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	80(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	68(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	76(%esp),%ebx
-	shrl	$10,%edi
-	addl	48(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,76(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3210313671(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	84(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	72(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	80(%esp),%ebx
-	shrl	$10,%edi
-	addl	52(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,80(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3336571891(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	88(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	76(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	84(%esp),%ebx
-	shrl	$10,%edi
-	addl	56(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,84(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3584528711(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	92(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	80(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	88(%esp),%ebx
-	shrl	$10,%edi
-	addl	60(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,88(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	113926993(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	32(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	84(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	92(%esp),%ebx
-	shrl	$10,%edi
-	addl	64(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,92(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	338241895(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	36(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	88(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	32(%esp),%ebx
-	shrl	$10,%edi
-	addl	68(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,32(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	666307205(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	40(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	92(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	36(%esp),%ebx
-	shrl	$10,%edi
-	addl	72(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,36(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	773529912(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	44(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	32(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	40(%esp),%ebx
-	shrl	$10,%edi
-	addl	76(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,40(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1294757372(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	48(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	36(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	44(%esp),%ebx
-	shrl	$10,%edi
-	addl	80(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,44(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1396182291(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	52(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	40(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	48(%esp),%ebx
-	shrl	$10,%edi
-	addl	84(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,48(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1695183700(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	56(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	44(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	52(%esp),%ebx
-	shrl	$10,%edi
-	addl	88(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,52(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1986661051(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	60(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	48(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	56(%esp),%ebx
-	shrl	$10,%edi
-	addl	92(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,56(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2177026350(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	64(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	52(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	60(%esp),%ebx
-	shrl	$10,%edi
-	addl	32(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,60(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2456956037(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	68(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	56(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	64(%esp),%ebx
-	shrl	$10,%edi
-	addl	36(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,64(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2730485921(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	72(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	60(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	68(%esp),%ebx
-	shrl	$10,%edi
-	addl	40(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,68(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2820302411(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	76(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	64(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	72(%esp),%ebx
-	shrl	$10,%edi
-	addl	44(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,72(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3259730800(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	80(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	68(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	76(%esp),%ebx
-	shrl	$10,%edi
-	addl	48(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,76(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3345764771(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	84(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	72(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	80(%esp),%ebx
-	shrl	$10,%edi
-	addl	52(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,80(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3516065817(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	88(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	76(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	84(%esp),%ebx
-	shrl	$10,%edi
-	addl	56(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,84(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3600352804(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	92(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	80(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	88(%esp),%ebx
-	shrl	$10,%edi
-	addl	60(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,88(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	4094571909(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	32(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	84(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	92(%esp),%ebx
-	shrl	$10,%edi
-	addl	64(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,92(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	275423344(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	36(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	88(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	32(%esp),%ebx
-	shrl	$10,%edi
-	addl	68(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,32(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	430227734(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	40(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	92(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	36(%esp),%ebx
-	shrl	$10,%edi
-	addl	72(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,36(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	506948616(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	44(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	32(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	40(%esp),%ebx
-	shrl	$10,%edi
-	addl	76(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,40(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	659060556(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	48(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	36(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	44(%esp),%ebx
-	shrl	$10,%edi
-	addl	80(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,44(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	883997877(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	52(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	40(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	48(%esp),%ebx
-	shrl	$10,%edi
-	addl	84(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,48(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	958139571(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	56(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	44(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	52(%esp),%ebx
-	shrl	$10,%edi
-	addl	88(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,52(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1322822218(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	60(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	48(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	56(%esp),%ebx
-	shrl	$10,%edi
-	addl	92(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,56(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1537002063(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	64(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	52(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	60(%esp),%ebx
-	shrl	$10,%edi
-	addl	32(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,60(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	1747873779(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	68(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	56(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	64(%esp),%ebx
-	shrl	$10,%edi
-	addl	36(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	20(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	24(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,64(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	addl	28(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	4(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	1955562222(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	72(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	12(%esp),%edx
-	addl	%ecx,%ebp
-	movl	60(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	68(%esp),%ebx
-	shrl	$10,%edi
-	addl	40(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	16(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	20(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,68(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,12(%esp)
-	xorl	%esi,%edx
-	addl	24(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,28(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2024104815(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	76(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%esi,%eax
-	movl	64(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	72(%esp),%ebx
-	shrl	$10,%edi
-	addl	44(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	12(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	16(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,72(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	addl	20(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	28(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,24(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2227730452(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	80(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	4(%esp),%edx
-	addl	%ecx,%ebp
-	movl	68(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	76(%esp),%ebx
-	shrl	$10,%edi
-	addl	48(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	8(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	12(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,76(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,4(%esp)
-	xorl	%esi,%edx
-	addl	16(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	24(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,20(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2361852424(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	84(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%esi,%eax
-	movl	72(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	80(%esp),%ebx
-	shrl	$10,%edi
-	addl	52(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	4(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	8(%esp),%edi
-	xorl	%ecx,%edx
-	movl	%ebx,80(%esp)
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	addl	12(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	20(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,16(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	2428436474(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	88(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	28(%esp),%edx
-	addl	%ecx,%ebp
-	movl	76(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	84(%esp),%ebx
-	shrl	$10,%edi
-	addl	56(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	4(%esp),%edi
-	xorl	%esi,%edx
-	movl	%ebx,84(%esp)
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,28(%esp)
-	xorl	%esi,%edx
-	addl	8(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	16(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,12(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	2756734187(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	movl	92(%esp),%ecx
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%esi,%eax
-	movl	80(%esp),%esi
-	movl	%ecx,%ebx
-	rorl	$11,%ecx
-	movl	%esi,%edi
-	rorl	$2,%esi
-	xorl	%ebx,%ecx
-	shrl	$3,%ebx
-	rorl	$7,%ecx
-	xorl	%edi,%esi
-	xorl	%ecx,%ebx
-	rorl	$17,%esi
-	addl	88(%esp),%ebx
-	shrl	$10,%edi
-	addl	60(%esp),%ebx
-	movl	%edx,%ecx
-	xorl	%esi,%edi
-	movl	28(%esp),%esi
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	(%esp),%edi
-	xorl	%ecx,%edx
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	addl	4(%esp),%ebx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%ebx
-	rorl	$9,%ecx
-	movl	%eax,%esi
-	movl	12(%esp),%edi
-	xorl	%eax,%ecx
-	movl	%eax,8(%esp)
-	xorl	%edi,%eax
-	rorl	$11,%ecx
-	andl	%eax,%ebp
-	leal	3204031479(%ebx,%edx,1),%edx
-	xorl	%esi,%ecx
-	xorl	%edi,%ebp
-	movl	32(%esp),%esi
-	rorl	$2,%ecx
-	addl	%edx,%ebp
-	addl	20(%esp),%edx
-	addl	%ecx,%ebp
-	movl	84(%esp),%ecx
-	movl	%esi,%ebx
-	rorl	$11,%esi
-	movl	%ecx,%edi
-	rorl	$2,%ecx
-	xorl	%ebx,%esi
-	shrl	$3,%ebx
-	rorl	$7,%esi
-	xorl	%edi,%ecx
-	xorl	%esi,%ebx
-	rorl	$17,%ecx
-	addl	92(%esp),%ebx
-	shrl	$10,%edi
-	addl	64(%esp),%ebx
-	movl	%edx,%esi
-	xorl	%ecx,%edi
-	movl	24(%esp),%ecx
-	rorl	$14,%edx
-	addl	%edi,%ebx
-	movl	28(%esp),%edi
-	xorl	%esi,%edx
-	xorl	%edi,%ecx
-	rorl	$5,%edx
-	andl	%esi,%ecx
-	movl	%esi,20(%esp)
-	xorl	%esi,%edx
-	addl	(%esp),%ebx
-	xorl	%ecx,%edi
-	rorl	$6,%edx
-	movl	%ebp,%esi
-	addl	%edi,%ebx
-	rorl	$9,%esi
-	movl	%ebp,%ecx
-	movl	8(%esp),%edi
-	xorl	%ebp,%esi
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	rorl	$11,%esi
-	andl	%ebp,%eax
-	leal	3329325298(%ebx,%edx,1),%edx
-	xorl	%ecx,%esi
-	xorl	%edi,%eax
-	rorl	$2,%esi
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%esi,%eax
-	movl	96(%esp),%esi
-	xorl	%edi,%ebp
-	movl	12(%esp),%ecx
-	addl	(%esi),%eax
-	addl	4(%esi),%ebp
-	addl	8(%esi),%edi
-	addl	12(%esi),%ecx
-	movl	%eax,(%esi)
-	movl	%ebp,4(%esi)
-	movl	%edi,8(%esi)
-	movl	%ecx,12(%esi)
-	movl	%ebp,4(%esp)
-	xorl	%edi,%ebp
-	movl	%edi,8(%esp)
-	movl	%ecx,12(%esp)
-	movl	20(%esp),%edi
-	movl	24(%esp),%ebx
-	movl	28(%esp),%ecx
-	addl	16(%esi),%edx
-	addl	20(%esi),%edi
-	addl	24(%esi),%ebx
-	addl	28(%esi),%ecx
-	movl	%edx,16(%esi)
-	movl	%edi,20(%esi)
-	movl	%ebx,24(%esi)
-	movl	%ecx,28(%esi)
-	movl	%edi,20(%esp)
-	movl	100(%esp),%edi
-	movl	%ebx,24(%esp)
-	movl	%ecx,28(%esp)
-	cmpl	104(%esp),%edi
-	jb	L009grand_loop
-	movl	108(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	5,0x90
-L005SSSE3:
-	leal	-96(%esp),%esp
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edi
-	movl	%ebx,4(%esp)
-	xorl	%ecx,%ebx
-	movl	%ecx,8(%esp)
-	movl	%edi,12(%esp)
-	movl	16(%esi),%edx
-	movl	20(%esi),%edi
-	movl	24(%esi),%ecx
-	movl	28(%esi),%esi
-	movl	%edi,20(%esp)
-	movl	100(%esp),%edi
-	movl	%ecx,24(%esp)
-	movl	%esi,28(%esp)
-	movdqa	256(%ebp),%xmm7
-	jmp	L010grand_ssse3
-.align	4,0x90
-L010grand_ssse3:
-	movdqu	(%edi),%xmm0
-	movdqu	16(%edi),%xmm1
-	movdqu	32(%edi),%xmm2
-	movdqu	48(%edi),%xmm3
-	addl	$64,%edi
-.byte	102,15,56,0,199
-	movl	%edi,100(%esp)
-.byte	102,15,56,0,207
-	movdqa	(%ebp),%xmm4
-.byte	102,15,56,0,215
-	movdqa	16(%ebp),%xmm5
-	paddd	%xmm0,%xmm4
-.byte	102,15,56,0,223
-	movdqa	32(%ebp),%xmm6
-	paddd	%xmm1,%xmm5
-	movdqa	48(%ebp),%xmm7
-	movdqa	%xmm4,32(%esp)
-	paddd	%xmm2,%xmm6
-	movdqa	%xmm5,48(%esp)
-	paddd	%xmm3,%xmm7
-	movdqa	%xmm6,64(%esp)
-	movdqa	%xmm7,80(%esp)
-	jmp	L011ssse3_00_47
-.align	4,0x90
-L011ssse3_00_47:
-	addl	$64,%ebp
-	movl	%edx,%ecx
-	movdqa	%xmm1,%xmm4
-	rorl	$14,%edx
-	movl	20(%esp),%esi
-	movdqa	%xmm3,%xmm7
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-.byte	102,15,58,15,224,4
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-.byte	102,15,58,15,250,4
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	movdqa	%xmm4,%xmm5
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	movdqa	%xmm4,%xmm6
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	psrld	$3,%xmm4
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm0
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	psrld	$7,%xmm6
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	pshufd	$250,%xmm3,%xmm7
-	xorl	%esi,%ecx
-	addl	32(%esp),%edx
-	pslld	$14,%xmm5
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm4
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	psrld	$11,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm5,%xmm4
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	pslld	$11,%xmm5
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	pxor	%xmm6,%xmm4
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	movdqa	%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	pxor	%xmm5,%xmm4
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	psrld	$10,%xmm7
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm4,%xmm0
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	psrlq	$17,%xmm6
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	rorl	$11,%ecx
-	pxor	%xmm6,%xmm7
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	psrlq	$2,%xmm6
-	addl	36(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	pshufd	$128,%xmm7,%xmm7
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	psrldq	$8,%xmm7
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	paddd	%xmm7,%xmm0
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,24(%esp)
-	pshufd	$80,%xmm0,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	movdqa	%xmm7,%xmm6
-	rorl	$11,%ecx
-	psrld	$10,%xmm7
-	andl	%eax,%ebx
-	psrlq	$17,%xmm6
-	xorl	%esi,%ecx
-	addl	40(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	psrlq	$2,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm6,%xmm7
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	movdqa	(%ebp),%xmm6
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	pslldq	$8,%xmm7
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm0
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	paddd	%xmm0,%xmm6
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	44(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movdqa	%xmm6,32(%esp)
-	movl	%edx,%ecx
-	movdqa	%xmm2,%xmm4
-	rorl	$14,%edx
-	movl	4(%esp),%esi
-	movdqa	%xmm0,%xmm7
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-.byte	102,15,58,15,225,4
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-.byte	102,15,58,15,251,4
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	movdqa	%xmm4,%xmm5
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	movdqa	%xmm4,%xmm6
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	psrld	$3,%xmm4
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm1
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	psrld	$7,%xmm6
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	pshufd	$250,%xmm0,%xmm7
-	xorl	%esi,%ecx
-	addl	48(%esp),%edx
-	pslld	$14,%xmm5
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm4
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	psrld	$11,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm5,%xmm4
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	pslld	$11,%xmm5
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	pxor	%xmm6,%xmm4
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	movdqa	%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	pxor	%xmm5,%xmm4
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	psrld	$10,%xmm7
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm4,%xmm1
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	psrlq	$17,%xmm6
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	rorl	$11,%ecx
-	pxor	%xmm6,%xmm7
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	psrlq	$2,%xmm6
-	addl	52(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	pshufd	$128,%xmm7,%xmm7
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	psrldq	$8,%xmm7
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	paddd	%xmm7,%xmm1
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,8(%esp)
-	pshufd	$80,%xmm1,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	movdqa	%xmm7,%xmm6
-	rorl	$11,%ecx
-	psrld	$10,%xmm7
-	andl	%eax,%ebx
-	psrlq	$17,%xmm6
-	xorl	%esi,%ecx
-	addl	56(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	psrlq	$2,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm6,%xmm7
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	movdqa	16(%ebp),%xmm6
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	pslldq	$8,%xmm7
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm1
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	paddd	%xmm1,%xmm6
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	60(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movdqa	%xmm6,48(%esp)
-	movl	%edx,%ecx
-	movdqa	%xmm3,%xmm4
-	rorl	$14,%edx
-	movl	20(%esp),%esi
-	movdqa	%xmm1,%xmm7
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-.byte	102,15,58,15,226,4
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-.byte	102,15,58,15,248,4
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	movdqa	%xmm4,%xmm5
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	movdqa	%xmm4,%xmm6
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	psrld	$3,%xmm4
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm2
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	psrld	$7,%xmm6
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	pshufd	$250,%xmm1,%xmm7
-	xorl	%esi,%ecx
-	addl	64(%esp),%edx
-	pslld	$14,%xmm5
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm4
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	psrld	$11,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm5,%xmm4
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	pslld	$11,%xmm5
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	pxor	%xmm6,%xmm4
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	movdqa	%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	pxor	%xmm5,%xmm4
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	psrld	$10,%xmm7
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm4,%xmm2
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	psrlq	$17,%xmm6
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	rorl	$11,%ecx
-	pxor	%xmm6,%xmm7
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	psrlq	$2,%xmm6
-	addl	68(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	pshufd	$128,%xmm7,%xmm7
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	psrldq	$8,%xmm7
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	paddd	%xmm7,%xmm2
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,24(%esp)
-	pshufd	$80,%xmm2,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	movdqa	%xmm7,%xmm6
-	rorl	$11,%ecx
-	psrld	$10,%xmm7
-	andl	%eax,%ebx
-	psrlq	$17,%xmm6
-	xorl	%esi,%ecx
-	addl	72(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	psrlq	$2,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm6,%xmm7
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	movdqa	32(%ebp),%xmm6
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	pslldq	$8,%xmm7
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm2
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	paddd	%xmm2,%xmm6
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	76(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movdqa	%xmm6,64(%esp)
-	movl	%edx,%ecx
-	movdqa	%xmm0,%xmm4
-	rorl	$14,%edx
-	movl	4(%esp),%esi
-	movdqa	%xmm2,%xmm7
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-.byte	102,15,58,15,227,4
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-.byte	102,15,58,15,249,4
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	movdqa	%xmm4,%xmm5
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	movdqa	%xmm4,%xmm6
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	psrld	$3,%xmm4
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm3
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	psrld	$7,%xmm6
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	pshufd	$250,%xmm2,%xmm7
-	xorl	%esi,%ecx
-	addl	80(%esp),%edx
-	pslld	$14,%xmm5
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm4
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	psrld	$11,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm5,%xmm4
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	pslld	$11,%xmm5
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	pxor	%xmm6,%xmm4
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	movdqa	%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	pxor	%xmm5,%xmm4
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	psrld	$10,%xmm7
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm4,%xmm3
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	psrlq	$17,%xmm6
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	rorl	$11,%ecx
-	pxor	%xmm6,%xmm7
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	psrlq	$2,%xmm6
-	addl	84(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	pshufd	$128,%xmm7,%xmm7
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	psrldq	$8,%xmm7
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	paddd	%xmm7,%xmm3
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,8(%esp)
-	pshufd	$80,%xmm3,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	movdqa	%xmm7,%xmm6
-	rorl	$11,%ecx
-	psrld	$10,%xmm7
-	andl	%eax,%ebx
-	psrlq	$17,%xmm6
-	xorl	%esi,%ecx
-	addl	88(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	pxor	%xmm6,%xmm7
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	psrlq	$2,%xmm6
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	pxor	%xmm6,%xmm7
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	movdqa	48(%ebp),%xmm6
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	pslldq	$8,%xmm7
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	paddd	%xmm7,%xmm3
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	paddd	%xmm3,%xmm6
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	92(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movdqa	%xmm6,80(%esp)
-	cmpl	$66051,64(%ebp)
-	jne	L011ssse3_00_47
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	20(%esp),%esi
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	32(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	36(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,24(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	40(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	44(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	4(%esp),%esi
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	48(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	52(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,8(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	56(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	60(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	20(%esp),%esi
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	64(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	68(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,24(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	72(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	76(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	4(%esp),%esi
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	80(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	84(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	rorl	$9,%ecx
-	movl	%eax,8(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	rorl	$11,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	88(%esp),%edx
-	xorl	%edi,%ebx
-	rorl	$2,%ecx
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	rorl	$14,%edx
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	rorl	$5,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	rorl	$6,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	rorl	$9,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	rorl	$11,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	92(%esp),%edx
-	xorl	%edi,%eax
-	rorl	$2,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movl	96(%esp),%esi
-	xorl	%edi,%ebx
-	movl	12(%esp),%ecx
-	addl	(%esi),%eax
-	addl	4(%esi),%ebx
-	addl	8(%esi),%edi
-	addl	12(%esi),%ecx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	movl	%edi,8(%esi)
-	movl	%ecx,12(%esi)
-	movl	%ebx,4(%esp)
-	xorl	%edi,%ebx
-	movl	%edi,8(%esp)
-	movl	%ecx,12(%esp)
-	movl	20(%esp),%edi
-	movl	24(%esp),%ecx
-	addl	16(%esi),%edx
-	addl	20(%esi),%edi
-	addl	24(%esi),%ecx
-	movl	%edx,16(%esi)
-	movl	%edi,20(%esi)
-	movl	%edi,20(%esp)
-	movl	28(%esp),%edi
-	movl	%ecx,24(%esi)
-	addl	28(%esi),%edi
-	movl	%ecx,24(%esp)
-	movl	%edi,28(%esi)
-	movl	%edi,28(%esp)
-	movl	100(%esp),%edi
-	movdqa	64(%ebp),%xmm7
-	subl	$192,%ebp
-	cmpl	104(%esp),%edi
-	jb	L010grand_ssse3
-	movl	108(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	5,0x90
-L004AVX:
-	leal	-96(%esp),%esp
-	vzeroall
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edi
-	movl	%ebx,4(%esp)
-	xorl	%ecx,%ebx
-	movl	%ecx,8(%esp)
-	movl	%edi,12(%esp)
-	movl	16(%esi),%edx
-	movl	20(%esi),%edi
-	movl	24(%esi),%ecx
-	movl	28(%esi),%esi
-	movl	%edi,20(%esp)
-	movl	100(%esp),%edi
-	movl	%ecx,24(%esp)
-	movl	%esi,28(%esp)
-	vmovdqa	256(%ebp),%xmm7
-	jmp	L012grand_avx
-.align	5,0x90
-L012grand_avx:
-	vmovdqu	(%edi),%xmm0
-	vmovdqu	16(%edi),%xmm1
-	vmovdqu	32(%edi),%xmm2
-	vmovdqu	48(%edi),%xmm3
-	addl	$64,%edi
-	vpshufb	%xmm7,%xmm0,%xmm0
-	movl	%edi,100(%esp)
-	vpshufb	%xmm7,%xmm1,%xmm1
-	vpshufb	%xmm7,%xmm2,%xmm2
-	vpaddd	(%ebp),%xmm0,%xmm4
-	vpshufb	%xmm7,%xmm3,%xmm3
-	vpaddd	16(%ebp),%xmm1,%xmm5
-	vpaddd	32(%ebp),%xmm2,%xmm6
-	vpaddd	48(%ebp),%xmm3,%xmm7
-	vmovdqa	%xmm4,32(%esp)
-	vmovdqa	%xmm5,48(%esp)
-	vmovdqa	%xmm6,64(%esp)
-	vmovdqa	%xmm7,80(%esp)
-	jmp	L013avx_00_47
-.align	4,0x90
-L013avx_00_47:
-	addl	$64,%ebp
-	vpalignr	$4,%xmm0,%xmm1,%xmm4
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	20(%esp),%esi
-	vpalignr	$4,%xmm2,%xmm3,%xmm7
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	vpaddd	%xmm7,%xmm0,%xmm0
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrld	$3,%xmm4,%xmm7
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	vpslld	$14,%xmm4,%xmm5
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,(%esp)
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	vpshufd	$250,%xmm3,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpsrld	$11,%xmm6,%xmm6
-	addl	32(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpxor	%xmm5,%xmm4,%xmm4
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	vpslld	$11,%xmm5,%xmm5
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	16(%esp),%esi
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$10,%xmm7,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	vpaddd	%xmm4,%xmm0,%xmm0
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,28(%esp)
-	vpxor	%xmm5,%xmm6,%xmm6
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	vpsrlq	$19,%xmm7,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	36(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	vpshufd	$132,%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	vpsrldq	$8,%xmm7,%xmm7
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	12(%esp),%esi
-	vpaddd	%xmm7,%xmm0,%xmm0
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	vpshufd	$80,%xmm0,%xmm7
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	vpxor	%xmm5,%xmm6,%xmm6
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,24(%esp)
-	vpsrlq	$19,%xmm7,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpshufd	$232,%xmm6,%xmm7
-	addl	40(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpslldq	$8,%xmm7,%xmm7
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	vpaddd	%xmm7,%xmm0,%xmm0
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	8(%esp),%esi
-	vpaddd	(%ebp),%xmm0,%xmm6
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	44(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	vmovdqa	%xmm6,32(%esp)
-	vpalignr	$4,%xmm1,%xmm2,%xmm4
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	4(%esp),%esi
-	vpalignr	$4,%xmm3,%xmm0,%xmm7
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	vpaddd	%xmm7,%xmm1,%xmm1
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrld	$3,%xmm4,%xmm7
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	vpslld	$14,%xmm4,%xmm5
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,16(%esp)
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	vpshufd	$250,%xmm0,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpsrld	$11,%xmm6,%xmm6
-	addl	48(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpxor	%xmm5,%xmm4,%xmm4
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	vpslld	$11,%xmm5,%xmm5
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	(%esp),%esi
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$10,%xmm7,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	vpaddd	%xmm4,%xmm1,%xmm1
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,12(%esp)
-	vpxor	%xmm5,%xmm6,%xmm6
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	vpsrlq	$19,%xmm7,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	52(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	vpshufd	$132,%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	vpsrldq	$8,%xmm7,%xmm7
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	28(%esp),%esi
-	vpaddd	%xmm7,%xmm1,%xmm1
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	vpshufd	$80,%xmm1,%xmm7
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	vpxor	%xmm5,%xmm6,%xmm6
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,8(%esp)
-	vpsrlq	$19,%xmm7,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpshufd	$232,%xmm6,%xmm7
-	addl	56(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpslldq	$8,%xmm7,%xmm7
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	vpaddd	%xmm7,%xmm1,%xmm1
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	24(%esp),%esi
-	vpaddd	16(%ebp),%xmm1,%xmm6
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	60(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	vmovdqa	%xmm6,48(%esp)
-	vpalignr	$4,%xmm2,%xmm3,%xmm4
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	20(%esp),%esi
-	vpalignr	$4,%xmm0,%xmm1,%xmm7
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	vpaddd	%xmm7,%xmm2,%xmm2
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrld	$3,%xmm4,%xmm7
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	vpslld	$14,%xmm4,%xmm5
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,(%esp)
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	vpshufd	$250,%xmm1,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpsrld	$11,%xmm6,%xmm6
-	addl	64(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpxor	%xmm5,%xmm4,%xmm4
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	vpslld	$11,%xmm5,%xmm5
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	16(%esp),%esi
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$10,%xmm7,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	vpaddd	%xmm4,%xmm2,%xmm2
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,28(%esp)
-	vpxor	%xmm5,%xmm6,%xmm6
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	vpsrlq	$19,%xmm7,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	68(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	vpshufd	$132,%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	vpsrldq	$8,%xmm7,%xmm7
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	12(%esp),%esi
-	vpaddd	%xmm7,%xmm2,%xmm2
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	vpshufd	$80,%xmm2,%xmm7
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	vpxor	%xmm5,%xmm6,%xmm6
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,24(%esp)
-	vpsrlq	$19,%xmm7,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpshufd	$232,%xmm6,%xmm7
-	addl	72(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpslldq	$8,%xmm7,%xmm7
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	vpaddd	%xmm7,%xmm2,%xmm2
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	8(%esp),%esi
-	vpaddd	32(%ebp),%xmm2,%xmm6
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	76(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	vmovdqa	%xmm6,64(%esp)
-	vpalignr	$4,%xmm3,%xmm0,%xmm4
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	4(%esp),%esi
-	vpalignr	$4,%xmm1,%xmm2,%xmm7
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	vpaddd	%xmm7,%xmm3,%xmm3
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrld	$3,%xmm4,%xmm7
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	vpslld	$14,%xmm4,%xmm5
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,16(%esp)
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	vpshufd	$250,%xmm2,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpsrld	$11,%xmm6,%xmm6
-	addl	80(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpxor	%xmm5,%xmm4,%xmm4
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	vpslld	$11,%xmm5,%xmm5
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	(%esp),%esi
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	vpsrld	$10,%xmm7,%xmm6
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	vpxor	%xmm5,%xmm4,%xmm4
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	vpaddd	%xmm4,%xmm3,%xmm3
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,12(%esp)
-	vpxor	%xmm5,%xmm6,%xmm6
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	vpsrlq	$19,%xmm7,%xmm7
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	84(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	vpshufd	$132,%xmm6,%xmm7
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	vpsrldq	$8,%xmm7,%xmm7
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	28(%esp),%esi
-	vpaddd	%xmm7,%xmm3,%xmm3
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	vpshufd	$80,%xmm3,%xmm7
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	vpsrlq	$17,%xmm7,%xmm5
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	vpxor	%xmm5,%xmm6,%xmm6
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,8(%esp)
-	vpsrlq	$19,%xmm7,%xmm7
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	vpshufd	$232,%xmm6,%xmm7
-	addl	88(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	vpslldq	$8,%xmm7,%xmm7
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	vpaddd	%xmm7,%xmm3,%xmm3
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	24(%esp),%esi
-	vpaddd	48(%ebp),%xmm3,%xmm6
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	92(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	vmovdqa	%xmm6,80(%esp)
-	cmpl	$66051,64(%ebp)
-	jne	L013avx_00_47
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	20(%esp),%esi
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	32(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	36(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,24(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	40(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	44(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	4(%esp),%esi
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	48(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	52(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,8(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	56(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	60(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	20(%esp),%esi
-	xorl	%ecx,%edx
-	movl	24(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,16(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	4(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	28(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	64(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	12(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	16(%esp),%esi
-	xorl	%ecx,%edx
-	movl	20(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,28(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	24(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	68(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	8(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	12(%esp),%esi
-	xorl	%ecx,%edx
-	movl	16(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,8(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	28(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,24(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	20(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	72(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	4(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	8(%esp),%esi
-	xorl	%ecx,%edx
-	movl	12(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,4(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	24(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,20(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	16(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	76(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	4(%esp),%esi
-	xorl	%ecx,%edx
-	movl	8(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	20(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,16(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	12(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	80(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	28(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	(%esp),%esi
-	xorl	%ecx,%edx
-	movl	4(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,28(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	16(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,12(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	8(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	84(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	24(%esp),%edx
-	addl	%ecx,%eax
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	28(%esp),%esi
-	xorl	%ecx,%edx
-	movl	(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,24(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%eax,%ecx
-	addl	%edi,%edx
-	movl	12(%esp),%edi
-	movl	%eax,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%eax,8(%esp)
-	xorl	%eax,%ecx
-	xorl	%edi,%eax
-	addl	4(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%eax,%ebx
-	xorl	%esi,%ecx
-	addl	88(%esp),%edx
-	xorl	%edi,%ebx
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%ebx
-	addl	20(%esp),%edx
-	addl	%ecx,%ebx
-	movl	%edx,%ecx
-	shrdl	$14,%edx,%edx
-	movl	24(%esp),%esi
-	xorl	%ecx,%edx
-	movl	28(%esp),%edi
-	xorl	%edi,%esi
-	shrdl	$5,%edx,%edx
-	andl	%ecx,%esi
-	movl	%ecx,20(%esp)
-	xorl	%ecx,%edx
-	xorl	%esi,%edi
-	shrdl	$6,%edx,%edx
-	movl	%ebx,%ecx
-	addl	%edi,%edx
-	movl	8(%esp),%edi
-	movl	%ebx,%esi
-	shrdl	$9,%ecx,%ecx
-	movl	%ebx,4(%esp)
-	xorl	%ebx,%ecx
-	xorl	%edi,%ebx
-	addl	(%esp),%edx
-	shrdl	$11,%ecx,%ecx
-	andl	%ebx,%eax
-	xorl	%esi,%ecx
-	addl	92(%esp),%edx
-	xorl	%edi,%eax
-	shrdl	$2,%ecx,%ecx
-	addl	%edx,%eax
-	addl	16(%esp),%edx
-	addl	%ecx,%eax
-	movl	96(%esp),%esi
-	xorl	%edi,%ebx
-	movl	12(%esp),%ecx
-	addl	(%esi),%eax
-	addl	4(%esi),%ebx
-	addl	8(%esi),%edi
-	addl	12(%esi),%ecx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	movl	%edi,8(%esi)
-	movl	%ecx,12(%esi)
-	movl	%ebx,4(%esp)
-	xorl	%edi,%ebx
-	movl	%edi,8(%esp)
-	movl	%ecx,12(%esp)
-	movl	20(%esp),%edi
-	movl	24(%esp),%ecx
-	addl	16(%esi),%edx
-	addl	20(%esi),%edi
-	addl	24(%esi),%ecx
-	movl	%edx,16(%esi)
-	movl	%edi,20(%esi)
-	movl	%edi,20(%esp)
-	movl	28(%esp),%edi
-	movl	%ecx,24(%esi)
-	addl	28(%esi),%edi
-	movl	%ecx,24(%esp)
-	movl	%edi,28(%esi)
-	movl	%edi,28(%esp)
-	movl	100(%esp),%edi
-	vmovdqa	64(%ebp),%xmm7
-	subl	$192,%ebp
-	cmpl	104(%esp),%edi
-	jb	L012grand_avx
-	movl	108(%esp),%esp
-	vzeroall
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/sha512-586-apple.S b/apple-x86/crypto/fipsmodule/sha512-586-apple.S
deleted file mode 100644
index cfdeac1..0000000
--- a/apple-x86/crypto/fipsmodule/sha512-586-apple.S
+++ /dev/null
@@ -1,2837 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_sha512_block_data_order
-.private_extern	_sha512_block_data_order
-.align	4
-_sha512_block_data_order:
-L_sha512_block_data_order_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	%esp,%ebx
-	call	L000pic_point
-L000pic_point:
-	popl	%ebp
-	leal	L001K512-L000pic_point(%ebp),%ebp
-	subl	$16,%esp
-	andl	$-64,%esp
-	shll	$7,%eax
-	addl	%edi,%eax
-	movl	%esi,(%esp)
-	movl	%edi,4(%esp)
-	movl	%eax,8(%esp)
-	movl	%ebx,12(%esp)
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
-	movl	(%edx),%ecx
-	testl	$67108864,%ecx
-	jz	L002loop_x86
-	movl	4(%edx),%edx
-	movq	(%esi),%mm0
-	andl	$16777216,%ecx
-	movq	8(%esi),%mm1
-	andl	$512,%edx
-	movq	16(%esi),%mm2
-	orl	%edx,%ecx
-	movq	24(%esi),%mm3
-	movq	32(%esi),%mm4
-	movq	40(%esi),%mm5
-	movq	48(%esi),%mm6
-	movq	56(%esi),%mm7
-	cmpl	$16777728,%ecx
-	je	L003SSSE3
-	subl	$80,%esp
-	jmp	L004loop_sse2
-.align	4,0x90
-L004loop_sse2:
-	movq	%mm1,8(%esp)
-	movq	%mm2,16(%esp)
-	movq	%mm3,24(%esp)
-	movq	%mm5,40(%esp)
-	movq	%mm6,48(%esp)
-	pxor	%mm1,%mm2
-	movq	%mm7,56(%esp)
-	movq	%mm0,%mm3
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	addl	$8,%edi
-	movl	$15,%edx
-	bswap	%eax
-	bswap	%ebx
-	jmp	L00500_14_sse2
-.align	4,0x90
-L00500_14_sse2:
-	movd	%eax,%mm1
-	movl	(%edi),%eax
-	movd	%ebx,%mm7
-	movl	4(%edi),%ebx
-	addl	$8,%edi
-	bswap	%eax
-	bswap	%ebx
-	punpckldq	%mm1,%mm7
-	movq	%mm4,%mm1
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	movq	%mm3,%mm0
-	movq	%mm7,72(%esp)
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	paddq	(%ebp),%mm7
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	subl	$8,%esp
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	40(%esp),%mm5
-	paddq	%mm2,%mm3
-	movq	%mm0,%mm2
-	addl	$8,%ebp
-	paddq	%mm6,%mm3
-	movq	48(%esp),%mm6
-	decl	%edx
-	jnz	L00500_14_sse2
-	movd	%eax,%mm1
-	movd	%ebx,%mm7
-	punpckldq	%mm1,%mm7
-	movq	%mm4,%mm1
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	movq	%mm3,%mm0
-	movq	%mm7,72(%esp)
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	paddq	(%ebp),%mm7
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	subl	$8,%esp
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	192(%esp),%mm7
-	paddq	%mm2,%mm3
-	movq	%mm0,%mm2
-	addl	$8,%ebp
-	paddq	%mm6,%mm3
-	pxor	%mm0,%mm0
-	movl	$32,%edx
-	jmp	L00616_79_sse2
-.align	4,0x90
-L00616_79_sse2:
-	movq	88(%esp),%mm5
-	movq	%mm7,%mm1
-	psrlq	$1,%mm7
-	movq	%mm5,%mm6
-	psrlq	$6,%mm5
-	psllq	$56,%mm1
-	paddq	%mm3,%mm0
-	movq	%mm7,%mm3
-	psrlq	$6,%mm7
-	pxor	%mm1,%mm3
-	psllq	$7,%mm1
-	pxor	%mm7,%mm3
-	psrlq	$1,%mm7
-	pxor	%mm1,%mm3
-	movq	%mm5,%mm1
-	psrlq	$13,%mm5
-	pxor	%mm3,%mm7
-	psllq	$3,%mm6
-	pxor	%mm5,%mm1
-	paddq	200(%esp),%mm7
-	pxor	%mm6,%mm1
-	psrlq	$42,%mm5
-	paddq	128(%esp),%mm7
-	pxor	%mm5,%mm1
-	psllq	$42,%mm6
-	movq	40(%esp),%mm5
-	pxor	%mm6,%mm1
-	movq	48(%esp),%mm6
-	paddq	%mm1,%mm7
-	movq	%mm4,%mm1
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	movq	%mm7,72(%esp)
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	paddq	(%ebp),%mm7
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	subl	$8,%esp
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	192(%esp),%mm7
-	paddq	%mm6,%mm2
-	addl	$8,%ebp
-	movq	88(%esp),%mm5
-	movq	%mm7,%mm1
-	psrlq	$1,%mm7
-	movq	%mm5,%mm6
-	psrlq	$6,%mm5
-	psllq	$56,%mm1
-	paddq	%mm3,%mm2
-	movq	%mm7,%mm3
-	psrlq	$6,%mm7
-	pxor	%mm1,%mm3
-	psllq	$7,%mm1
-	pxor	%mm7,%mm3
-	psrlq	$1,%mm7
-	pxor	%mm1,%mm3
-	movq	%mm5,%mm1
-	psrlq	$13,%mm5
-	pxor	%mm3,%mm7
-	psllq	$3,%mm6
-	pxor	%mm5,%mm1
-	paddq	200(%esp),%mm7
-	pxor	%mm6,%mm1
-	psrlq	$42,%mm5
-	paddq	128(%esp),%mm7
-	pxor	%mm5,%mm1
-	psllq	$42,%mm6
-	movq	40(%esp),%mm5
-	pxor	%mm6,%mm1
-	movq	48(%esp),%mm6
-	paddq	%mm1,%mm7
-	movq	%mm4,%mm1
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	movq	%mm7,72(%esp)
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	paddq	(%ebp),%mm7
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	subl	$8,%esp
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	192(%esp),%mm7
-	paddq	%mm6,%mm0
-	addl	$8,%ebp
-	decl	%edx
-	jnz	L00616_79_sse2
-	paddq	%mm3,%mm0
-	movq	8(%esp),%mm1
-	movq	24(%esp),%mm3
-	movq	40(%esp),%mm5
-	movq	48(%esp),%mm6
-	movq	56(%esp),%mm7
-	pxor	%mm1,%mm2
-	paddq	(%esi),%mm0
-	paddq	8(%esi),%mm1
-	paddq	16(%esi),%mm2
-	paddq	24(%esi),%mm3
-	paddq	32(%esi),%mm4
-	paddq	40(%esi),%mm5
-	paddq	48(%esi),%mm6
-	paddq	56(%esi),%mm7
-	movl	$640,%eax
-	movq	%mm0,(%esi)
-	movq	%mm1,8(%esi)
-	movq	%mm2,16(%esi)
-	movq	%mm3,24(%esi)
-	movq	%mm4,32(%esi)
-	movq	%mm5,40(%esi)
-	movq	%mm6,48(%esi)
-	movq	%mm7,56(%esi)
-	leal	(%esp,%eax,1),%esp
-	subl	%eax,%ebp
-	cmpl	88(%esp),%edi
-	jb	L004loop_sse2
-	movl	92(%esp),%esp
-	emms
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	5,0x90
-L003SSSE3:
-	leal	-64(%esp),%edx
-	subl	$256,%esp
-	movdqa	640(%ebp),%xmm1
-	movdqu	(%edi),%xmm0
-.byte	102,15,56,0,193
-	movdqa	(%ebp),%xmm3
-	movdqa	%xmm1,%xmm2
-	movdqu	16(%edi),%xmm1
-	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
-	movdqa	%xmm3,-128(%edx)
-	movdqa	16(%ebp),%xmm4
-	movdqa	%xmm2,%xmm3
-	movdqu	32(%edi),%xmm2
-	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
-	movdqa	%xmm4,-112(%edx)
-	movdqa	32(%ebp),%xmm5
-	movdqa	%xmm3,%xmm4
-	movdqu	48(%edi),%xmm3
-	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
-	movdqa	%xmm5,-96(%edx)
-	movdqa	48(%ebp),%xmm6
-	movdqa	%xmm4,%xmm5
-	movdqu	64(%edi),%xmm4
-	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
-	movdqa	%xmm6,-80(%edx)
-	movdqa	64(%ebp),%xmm7
-	movdqa	%xmm5,%xmm6
-	movdqu	80(%edi),%xmm5
-	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
-	movdqa	%xmm7,-64(%edx)
-	movdqa	%xmm0,(%edx)
-	movdqa	80(%ebp),%xmm0
-	movdqa	%xmm6,%xmm7
-	movdqu	96(%edi),%xmm6
-	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
-	movdqa	%xmm0,-48(%edx)
-	movdqa	%xmm1,16(%edx)
-	movdqa	96(%ebp),%xmm1
-	movdqa	%xmm7,%xmm0
-	movdqu	112(%edi),%xmm7
-	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
-	movdqa	%xmm1,-32(%edx)
-	movdqa	%xmm2,32(%edx)
-	movdqa	112(%ebp),%xmm2
-	movdqa	(%edx),%xmm0
-	paddq	%xmm7,%xmm2
-	movdqa	%xmm2,-16(%edx)
-	nop
-.align	5,0x90
-L007loop_ssse3:
-	movdqa	16(%edx),%xmm2
-	movdqa	%xmm3,48(%edx)
-	leal	128(%ebp),%ebp
-	movq	%mm1,8(%esp)
-	movl	%edi,%ebx
-	movq	%mm2,16(%esp)
-	leal	128(%edi),%edi
-	movq	%mm3,24(%esp)
-	cmpl	%eax,%edi
-	movq	%mm5,40(%esp)
-	cmovbl	%edi,%ebx
-	movq	%mm6,48(%esp)
-	movl	$4,%ecx
-	pxor	%mm1,%mm2
-	movq	%mm7,56(%esp)
-	pxor	%mm3,%mm3
-	jmp	L00800_47_ssse3
-.align	5,0x90
-L00800_47_ssse3:
-	movdqa	%xmm5,%xmm3
-	movdqa	%xmm2,%xmm1
-.byte	102,15,58,15,208,8
-	movdqa	%xmm4,(%edx)
-.byte	102,15,58,15,220,8
-	movdqa	%xmm2,%xmm4
-	psrlq	$7,%xmm2
-	paddq	%xmm3,%xmm0
-	movdqa	%xmm4,%xmm3
-	psrlq	$1,%xmm4
-	psllq	$56,%xmm3
-	pxor	%xmm4,%xmm2
-	psrlq	$7,%xmm4
-	pxor	%xmm3,%xmm2
-	psllq	$7,%xmm3
-	pxor	%xmm4,%xmm2
-	movdqa	%xmm7,%xmm4
-	pxor	%xmm3,%xmm2
-	movdqa	%xmm7,%xmm3
-	psrlq	$6,%xmm4
-	paddq	%xmm2,%xmm0
-	movdqa	%xmm7,%xmm2
-	psrlq	$19,%xmm3
-	psllq	$3,%xmm2
-	pxor	%xmm3,%xmm4
-	psrlq	$42,%xmm3
-	pxor	%xmm2,%xmm4
-	psllq	$42,%xmm2
-	pxor	%xmm3,%xmm4
-	movdqa	32(%edx),%xmm3
-	pxor	%xmm2,%xmm4
-	movdqa	(%ebp),%xmm2
-	movq	%mm4,%mm1
-	paddq	%xmm4,%xmm0
-	movq	-128(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	paddq	%xmm0,%xmm2
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	32(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	40(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-120(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,24(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,56(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	48(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	16(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	24(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	32(%esp),%mm6
-	movdqa	%xmm2,-128(%edx)
-	movdqa	%xmm6,%xmm4
-	movdqa	%xmm3,%xmm2
-.byte	102,15,58,15,217,8
-	movdqa	%xmm5,16(%edx)
-.byte	102,15,58,15,229,8
-	movdqa	%xmm3,%xmm5
-	psrlq	$7,%xmm3
-	paddq	%xmm4,%xmm1
-	movdqa	%xmm5,%xmm4
-	psrlq	$1,%xmm5
-	psllq	$56,%xmm4
-	pxor	%xmm5,%xmm3
-	psrlq	$7,%xmm5
-	pxor	%xmm4,%xmm3
-	psllq	$7,%xmm4
-	pxor	%xmm5,%xmm3
-	movdqa	%xmm0,%xmm5
-	pxor	%xmm4,%xmm3
-	movdqa	%xmm0,%xmm4
-	psrlq	$6,%xmm5
-	paddq	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm3
-	psrlq	$19,%xmm4
-	psllq	$3,%xmm3
-	pxor	%xmm4,%xmm5
-	psrlq	$42,%xmm4
-	pxor	%xmm3,%xmm5
-	psllq	$42,%xmm3
-	pxor	%xmm4,%xmm5
-	movdqa	48(%edx),%xmm4
-	pxor	%xmm3,%xmm5
-	movdqa	16(%ebp),%xmm3
-	movq	%mm4,%mm1
-	paddq	%xmm5,%xmm1
-	movq	-112(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,16(%esp)
-	paddq	%xmm1,%xmm3
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,48(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	40(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	8(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	56(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	16(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	24(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-104(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,8(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,40(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	32(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	48(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	8(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	16(%esp),%mm6
-	movdqa	%xmm3,-112(%edx)
-	movdqa	%xmm7,%xmm5
-	movdqa	%xmm4,%xmm3
-.byte	102,15,58,15,226,8
-	movdqa	%xmm6,32(%edx)
-.byte	102,15,58,15,238,8
-	movdqa	%xmm4,%xmm6
-	psrlq	$7,%xmm4
-	paddq	%xmm5,%xmm2
-	movdqa	%xmm6,%xmm5
-	psrlq	$1,%xmm6
-	psllq	$56,%xmm5
-	pxor	%xmm6,%xmm4
-	psrlq	$7,%xmm6
-	pxor	%xmm5,%xmm4
-	psllq	$7,%xmm5
-	pxor	%xmm6,%xmm4
-	movdqa	%xmm1,%xmm6
-	pxor	%xmm5,%xmm4
-	movdqa	%xmm1,%xmm5
-	psrlq	$6,%xmm6
-	paddq	%xmm4,%xmm2
-	movdqa	%xmm1,%xmm4
-	psrlq	$19,%xmm5
-	psllq	$3,%xmm4
-	pxor	%xmm5,%xmm6
-	psrlq	$42,%xmm5
-	pxor	%xmm4,%xmm6
-	psllq	$42,%xmm4
-	pxor	%xmm5,%xmm6
-	movdqa	(%edx),%xmm5
-	pxor	%xmm4,%xmm6
-	movdqa	32(%ebp),%xmm4
-	movq	%mm4,%mm1
-	paddq	%xmm6,%xmm2
-	movq	-96(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,(%esp)
-	paddq	%xmm2,%xmm4
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,32(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	24(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	56(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	40(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	8(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-88(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,56(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,24(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	16(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	48(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	32(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	56(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	(%esp),%mm6
-	movdqa	%xmm4,-96(%edx)
-	movdqa	%xmm0,%xmm6
-	movdqa	%xmm5,%xmm4
-.byte	102,15,58,15,235,8
-	movdqa	%xmm7,48(%edx)
-.byte	102,15,58,15,247,8
-	movdqa	%xmm5,%xmm7
-	psrlq	$7,%xmm5
-	paddq	%xmm6,%xmm3
-	movdqa	%xmm7,%xmm6
-	psrlq	$1,%xmm7
-	psllq	$56,%xmm6
-	pxor	%xmm7,%xmm5
-	psrlq	$7,%xmm7
-	pxor	%xmm6,%xmm5
-	psllq	$7,%xmm6
-	pxor	%xmm7,%xmm5
-	movdqa	%xmm2,%xmm7
-	pxor	%xmm6,%xmm5
-	movdqa	%xmm2,%xmm6
-	psrlq	$6,%xmm7
-	paddq	%xmm5,%xmm3
-	movdqa	%xmm2,%xmm5
-	psrlq	$19,%xmm6
-	psllq	$3,%xmm5
-	pxor	%xmm6,%xmm7
-	psrlq	$42,%xmm6
-	pxor	%xmm5,%xmm7
-	psllq	$42,%xmm5
-	pxor	%xmm6,%xmm7
-	movdqa	16(%edx),%xmm6
-	pxor	%xmm5,%xmm7
-	movdqa	48(%ebp),%xmm5
-	movq	%mm4,%mm1
-	paddq	%xmm7,%xmm3
-	movq	-80(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,48(%esp)
-	paddq	%xmm3,%xmm5
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,16(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	8(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	40(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	24(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	48(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	56(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-72(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,40(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,8(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	32(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	16(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	40(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	48(%esp),%mm6
-	movdqa	%xmm5,-80(%edx)
-	movdqa	%xmm1,%xmm7
-	movdqa	%xmm6,%xmm5
-.byte	102,15,58,15,244,8
-	movdqa	%xmm0,(%edx)
-.byte	102,15,58,15,248,8
-	movdqa	%xmm6,%xmm0
-	psrlq	$7,%xmm6
-	paddq	%xmm7,%xmm4
-	movdqa	%xmm0,%xmm7
-	psrlq	$1,%xmm0
-	psllq	$56,%xmm7
-	pxor	%xmm0,%xmm6
-	psrlq	$7,%xmm0
-	pxor	%xmm7,%xmm6
-	psllq	$7,%xmm7
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm3,%xmm0
-	pxor	%xmm7,%xmm6
-	movdqa	%xmm3,%xmm7
-	psrlq	$6,%xmm0
-	paddq	%xmm6,%xmm4
-	movdqa	%xmm3,%xmm6
-	psrlq	$19,%xmm7
-	psllq	$3,%xmm6
-	pxor	%xmm7,%xmm0
-	psrlq	$42,%xmm7
-	pxor	%xmm6,%xmm0
-	psllq	$42,%xmm6
-	pxor	%xmm7,%xmm0
-	movdqa	32(%edx),%xmm7
-	pxor	%xmm6,%xmm0
-	movdqa	64(%ebp),%xmm6
-	movq	%mm4,%mm1
-	paddq	%xmm0,%xmm4
-	movq	-64(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	paddq	%xmm4,%xmm6
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	32(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	40(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-56(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,24(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,56(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	48(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	16(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	24(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	32(%esp),%mm6
-	movdqa	%xmm6,-64(%edx)
-	movdqa	%xmm2,%xmm0
-	movdqa	%xmm7,%xmm6
-.byte	102,15,58,15,253,8
-	movdqa	%xmm1,16(%edx)
-.byte	102,15,58,15,193,8
-	movdqa	%xmm7,%xmm1
-	psrlq	$7,%xmm7
-	paddq	%xmm0,%xmm5
-	movdqa	%xmm1,%xmm0
-	psrlq	$1,%xmm1
-	psllq	$56,%xmm0
-	pxor	%xmm1,%xmm7
-	psrlq	$7,%xmm1
-	pxor	%xmm0,%xmm7
-	psllq	$7,%xmm0
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm4,%xmm1
-	pxor	%xmm0,%xmm7
-	movdqa	%xmm4,%xmm0
-	psrlq	$6,%xmm1
-	paddq	%xmm7,%xmm5
-	movdqa	%xmm4,%xmm7
-	psrlq	$19,%xmm0
-	psllq	$3,%xmm7
-	pxor	%xmm0,%xmm1
-	psrlq	$42,%xmm0
-	pxor	%xmm7,%xmm1
-	psllq	$42,%xmm7
-	pxor	%xmm0,%xmm1
-	movdqa	48(%edx),%xmm0
-	pxor	%xmm7,%xmm1
-	movdqa	80(%ebp),%xmm7
-	movq	%mm4,%mm1
-	paddq	%xmm1,%xmm5
-	movq	-48(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,16(%esp)
-	paddq	%xmm5,%xmm7
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,48(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	40(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	8(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	56(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	16(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	24(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-40(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,8(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,40(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	32(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	48(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	8(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	16(%esp),%mm6
-	movdqa	%xmm7,-48(%edx)
-	movdqa	%xmm3,%xmm1
-	movdqa	%xmm0,%xmm7
-.byte	102,15,58,15,198,8
-	movdqa	%xmm2,32(%edx)
-.byte	102,15,58,15,202,8
-	movdqa	%xmm0,%xmm2
-	psrlq	$7,%xmm0
-	paddq	%xmm1,%xmm6
-	movdqa	%xmm2,%xmm1
-	psrlq	$1,%xmm2
-	psllq	$56,%xmm1
-	pxor	%xmm2,%xmm0
-	psrlq	$7,%xmm2
-	pxor	%xmm1,%xmm0
-	psllq	$7,%xmm1
-	pxor	%xmm2,%xmm0
-	movdqa	%xmm5,%xmm2
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm5,%xmm1
-	psrlq	$6,%xmm2
-	paddq	%xmm0,%xmm6
-	movdqa	%xmm5,%xmm0
-	psrlq	$19,%xmm1
-	psllq	$3,%xmm0
-	pxor	%xmm1,%xmm2
-	psrlq	$42,%xmm1
-	pxor	%xmm0,%xmm2
-	psllq	$42,%xmm0
-	pxor	%xmm1,%xmm2
-	movdqa	(%edx),%xmm1
-	pxor	%xmm0,%xmm2
-	movdqa	96(%ebp),%xmm0
-	movq	%mm4,%mm1
-	paddq	%xmm2,%xmm6
-	movq	-32(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,(%esp)
-	paddq	%xmm6,%xmm0
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,32(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	24(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	56(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	40(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	8(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-24(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,56(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,24(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	16(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	48(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	32(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	56(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	(%esp),%mm6
-	movdqa	%xmm0,-32(%edx)
-	movdqa	%xmm4,%xmm2
-	movdqa	%xmm1,%xmm0
-.byte	102,15,58,15,207,8
-	movdqa	%xmm3,48(%edx)
-.byte	102,15,58,15,211,8
-	movdqa	%xmm1,%xmm3
-	psrlq	$7,%xmm1
-	paddq	%xmm2,%xmm7
-	movdqa	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	psllq	$56,%xmm2
-	pxor	%xmm3,%xmm1
-	psrlq	$7,%xmm3
-	pxor	%xmm2,%xmm1
-	psllq	$7,%xmm2
-	pxor	%xmm3,%xmm1
-	movdqa	%xmm6,%xmm3
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm6,%xmm2
-	psrlq	$6,%xmm3
-	paddq	%xmm1,%xmm7
-	movdqa	%xmm6,%xmm1
-	psrlq	$19,%xmm2
-	psllq	$3,%xmm1
-	pxor	%xmm2,%xmm3
-	psrlq	$42,%xmm2
-	pxor	%xmm1,%xmm3
-	psllq	$42,%xmm1
-	pxor	%xmm2,%xmm3
-	movdqa	16(%edx),%xmm2
-	pxor	%xmm1,%xmm3
-	movdqa	112(%ebp),%xmm1
-	movq	%mm4,%mm1
-	paddq	%xmm3,%xmm7
-	movq	-16(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,48(%esp)
-	paddq	%xmm7,%xmm1
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,16(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	8(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	40(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	24(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	48(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	56(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-8(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,40(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,8(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	32(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	16(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	40(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	48(%esp),%mm6
-	movdqa	%xmm1,-16(%edx)
-	leal	128(%ebp),%ebp
-	decl	%ecx
-	jnz	L00800_47_ssse3
-	movdqa	(%ebp),%xmm1
-	leal	-640(%ebp),%ebp
-	movdqu	(%ebx),%xmm0
-.byte	102,15,56,0,193
-	movdqa	(%ebp),%xmm3
-	movdqa	%xmm1,%xmm2
-	movdqu	16(%ebx),%xmm1
-	paddq	%xmm0,%xmm3
-.byte	102,15,56,0,202
-	movq	%mm4,%mm1
-	movq	-128(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	32(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	40(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-120(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,24(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,56(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	48(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	16(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	24(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	32(%esp),%mm6
-	movdqa	%xmm3,-128(%edx)
-	movdqa	16(%ebp),%xmm4
-	movdqa	%xmm2,%xmm3
-	movdqu	32(%ebx),%xmm2
-	paddq	%xmm1,%xmm4
-.byte	102,15,56,0,211
-	movq	%mm4,%mm1
-	movq	-112(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,16(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,48(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	40(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	8(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	56(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	16(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	24(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-104(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,8(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,40(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	32(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	48(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	8(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	16(%esp),%mm6
-	movdqa	%xmm4,-112(%edx)
-	movdqa	32(%ebp),%xmm5
-	movdqa	%xmm3,%xmm4
-	movdqu	48(%ebx),%xmm3
-	paddq	%xmm2,%xmm5
-.byte	102,15,56,0,220
-	movq	%mm4,%mm1
-	movq	-96(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,32(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	24(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	56(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	40(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	8(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-88(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,56(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,24(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	16(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	48(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	32(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	56(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	(%esp),%mm6
-	movdqa	%xmm5,-96(%edx)
-	movdqa	48(%ebp),%xmm6
-	movdqa	%xmm4,%xmm5
-	movdqu	64(%ebx),%xmm4
-	paddq	%xmm3,%xmm6
-.byte	102,15,56,0,229
-	movq	%mm4,%mm1
-	movq	-80(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,48(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,16(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	8(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	40(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	24(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	48(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	56(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-72(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,40(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,8(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	32(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	16(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	40(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	48(%esp),%mm6
-	movdqa	%xmm6,-80(%edx)
-	movdqa	64(%ebp),%xmm7
-	movdqa	%xmm5,%xmm6
-	movdqu	80(%ebx),%xmm5
-	paddq	%xmm4,%xmm7
-.byte	102,15,56,0,238
-	movq	%mm4,%mm1
-	movq	-64(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,32(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	56(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	24(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	8(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	32(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	40(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-56(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,24(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,56(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	48(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	16(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	24(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	32(%esp),%mm6
-	movdqa	%xmm7,-64(%edx)
-	movdqa	%xmm0,(%edx)
-	movdqa	80(%ebp),%xmm0
-	movdqa	%xmm6,%xmm7
-	movdqu	96(%ebx),%xmm6
-	paddq	%xmm5,%xmm0
-.byte	102,15,56,0,247
-	movq	%mm4,%mm1
-	movq	-48(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,16(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,48(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	40(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	8(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	56(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	16(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	24(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-40(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,8(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,40(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	32(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	48(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	8(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	16(%esp),%mm6
-	movdqa	%xmm0,-48(%edx)
-	movdqa	%xmm1,16(%edx)
-	movdqa	96(%ebp),%xmm1
-	movdqa	%xmm7,%xmm0
-	movdqu	112(%ebx),%xmm7
-	paddq	%xmm6,%xmm1
-.byte	102,15,56,0,248
-	movq	%mm4,%mm1
-	movq	-32(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,32(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	24(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	56(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	40(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	8(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-24(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,56(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,24(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	16(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	48(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	32(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	56(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	(%esp),%mm6
-	movdqa	%xmm1,-32(%edx)
-	movdqa	%xmm2,32(%edx)
-	movdqa	112(%ebp),%xmm2
-	movdqa	(%edx),%xmm0
-	paddq	%xmm7,%xmm2
-	movq	%mm4,%mm1
-	movq	-16(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,48(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm0
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm0,16(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	8(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	40(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm0,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm0,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	24(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm0,%mm2
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	pxor	%mm7,%mm6
-	movq	48(%esp),%mm5
-	paddq	%mm6,%mm2
-	movq	56(%esp),%mm6
-	movq	%mm4,%mm1
-	movq	-8(%edx),%mm7
-	pxor	%mm6,%mm5
-	psrlq	$14,%mm1
-	movq	%mm4,40(%esp)
-	pand	%mm4,%mm5
-	psllq	$23,%mm4
-	paddq	%mm3,%mm2
-	movq	%mm1,%mm3
-	psrlq	$4,%mm1
-	pxor	%mm6,%mm5
-	pxor	%mm4,%mm3
-	psllq	$23,%mm4
-	pxor	%mm1,%mm3
-	movq	%mm2,8(%esp)
-	paddq	%mm5,%mm7
-	pxor	%mm4,%mm3
-	psrlq	$23,%mm1
-	paddq	(%esp),%mm7
-	pxor	%mm1,%mm3
-	psllq	$4,%mm4
-	pxor	%mm4,%mm3
-	movq	32(%esp),%mm4
-	paddq	%mm7,%mm3
-	movq	%mm2,%mm5
-	psrlq	$28,%mm5
-	paddq	%mm3,%mm4
-	movq	%mm2,%mm6
-	movq	%mm5,%mm7
-	psllq	$25,%mm6
-	movq	16(%esp),%mm1
-	psrlq	$6,%mm5
-	pxor	%mm6,%mm7
-	psllq	$5,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm2
-	psrlq	$5,%mm5
-	pxor	%mm6,%mm7
-	pand	%mm2,%mm0
-	psllq	$6,%mm6
-	pxor	%mm5,%mm7
-	pxor	%mm1,%mm0
-	pxor	%mm7,%mm6
-	movq	40(%esp),%mm5
-	paddq	%mm6,%mm0
-	movq	48(%esp),%mm6
-	movdqa	%xmm2,-16(%edx)
-	movq	8(%esp),%mm1
-	paddq	%mm3,%mm0
-	movq	24(%esp),%mm3
-	movq	56(%esp),%mm7
-	pxor	%mm1,%mm2
-	paddq	(%esi),%mm0
-	paddq	8(%esi),%mm1
-	paddq	16(%esi),%mm2
-	paddq	24(%esi),%mm3
-	paddq	32(%esi),%mm4
-	paddq	40(%esi),%mm5
-	paddq	48(%esi),%mm6
-	paddq	56(%esi),%mm7
-	movq	%mm0,(%esi)
-	movq	%mm1,8(%esi)
-	movq	%mm2,16(%esi)
-	movq	%mm3,24(%esi)
-	movq	%mm4,32(%esi)
-	movq	%mm5,40(%esi)
-	movq	%mm6,48(%esi)
-	movq	%mm7,56(%esi)
-	cmpl	%eax,%edi
-	jb	L007loop_ssse3
-	movl	76(%edx),%esp
-	emms
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	4,0x90
-L002loop_x86:
-	movl	(%edi),%eax
-	movl	4(%edi),%ebx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	16(%edi),%eax
-	movl	20(%edi),%ebx
-	movl	24(%edi),%ecx
-	movl	28(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	32(%edi),%eax
-	movl	36(%edi),%ebx
-	movl	40(%edi),%ecx
-	movl	44(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	48(%edi),%eax
-	movl	52(%edi),%ebx
-	movl	56(%edi),%ecx
-	movl	60(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	64(%edi),%eax
-	movl	68(%edi),%ebx
-	movl	72(%edi),%ecx
-	movl	76(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	80(%edi),%eax
-	movl	84(%edi),%ebx
-	movl	88(%edi),%ecx
-	movl	92(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	96(%edi),%eax
-	movl	100(%edi),%ebx
-	movl	104(%edi),%ecx
-	movl	108(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	112(%edi),%eax
-	movl	116(%edi),%ebx
-	movl	120(%edi),%ecx
-	movl	124(%edi),%edx
-	bswap	%eax
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	pushl	%eax
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	addl	$128,%edi
-	subl	$72,%esp
-	movl	%edi,204(%esp)
-	leal	8(%esp),%edi
-	movl	$16,%ecx
-.long	2784229001
-.align	4,0x90
-L00900_15_x86:
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$148,%dl
-	jne	L00900_15_x86
-.align	4,0x90
-L01016_79_x86:
-	movl	312(%esp),%ecx
-	movl	316(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$1,%ecx
-	movl	%edx,%edi
-	shrl	$1,%edx
-	movl	%ecx,%eax
-	shll	$24,%esi
-	movl	%edx,%ebx
-	shll	$24,%edi
-	xorl	%esi,%ebx
-	shrl	$6,%ecx
-	xorl	%edi,%eax
-	shrl	$6,%edx
-	xorl	%ecx,%eax
-	shll	$7,%esi
-	xorl	%edx,%ebx
-	shll	$1,%edi
-	xorl	%esi,%ebx
-	shrl	$1,%ecx
-	xorl	%edi,%eax
-	shrl	$1,%edx
-	xorl	%ecx,%eax
-	shll	$6,%edi
-	xorl	%edx,%ebx
-	xorl	%edi,%eax
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movl	208(%esp),%ecx
-	movl	212(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$6,%ecx
-	movl	%edx,%edi
-	shrl	$6,%edx
-	movl	%ecx,%eax
-	shll	$3,%esi
-	movl	%edx,%ebx
-	shll	$3,%edi
-	xorl	%esi,%eax
-	shrl	$13,%ecx
-	xorl	%edi,%ebx
-	shrl	$13,%edx
-	xorl	%ecx,%eax
-	shll	$10,%esi
-	xorl	%edx,%ebx
-	shll	$10,%edi
-	xorl	%esi,%ebx
-	shrl	$10,%ecx
-	xorl	%edi,%eax
-	shrl	$10,%edx
-	xorl	%ecx,%ebx
-	shll	$13,%edi
-	xorl	%edx,%eax
-	xorl	%edi,%eax
-	movl	320(%esp),%ecx
-	movl	324(%esp),%edx
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	movl	248(%esp),%esi
-	movl	252(%esp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,192(%esp)
-	movl	%ebx,196(%esp)
-	movl	40(%esp),%ecx
-	movl	44(%esp),%edx
-	movl	%ecx,%esi
-	shrl	$9,%ecx
-	movl	%edx,%edi
-	shrl	$9,%edx
-	movl	%ecx,%ebx
-	shll	$14,%esi
-	movl	%edx,%eax
-	shll	$14,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%eax
-	shll	$4,%esi
-	xorl	%edx,%ebx
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$4,%ecx
-	xorl	%edi,%eax
-	shrl	$4,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	48(%esp),%ecx
-	movl	52(%esp),%edx
-	movl	56(%esp),%esi
-	movl	60(%esp),%edi
-	addl	64(%esp),%eax
-	adcl	68(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	andl	40(%esp),%ecx
-	andl	44(%esp),%edx
-	addl	192(%esp),%eax
-	adcl	196(%esp),%ebx
-	xorl	%esi,%ecx
-	xorl	%edi,%edx
-	movl	(%ebp),%esi
-	movl	4(%ebp),%edi
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	32(%esp),%ecx
-	movl	36(%esp),%edx
-	addl	%esi,%eax
-	adcl	%edi,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	%eax,32(%esp)
-	movl	%ebx,36(%esp)
-	movl	%ecx,%esi
-	shrl	$2,%ecx
-	movl	%edx,%edi
-	shrl	$2,%edx
-	movl	%ecx,%ebx
-	shll	$4,%esi
-	movl	%edx,%eax
-	shll	$4,%edi
-	xorl	%esi,%ebx
-	shrl	$5,%ecx
-	xorl	%edi,%eax
-	shrl	$5,%edx
-	xorl	%ecx,%ebx
-	shll	$21,%esi
-	xorl	%edx,%eax
-	shll	$21,%edi
-	xorl	%esi,%eax
-	shrl	$21,%ecx
-	xorl	%edi,%ebx
-	shrl	$21,%edx
-	xorl	%ecx,%eax
-	shll	$5,%esi
-	xorl	%edx,%ebx
-	shll	$5,%edi
-	xorl	%esi,%eax
-	xorl	%edi,%ebx
-	movl	8(%esp),%ecx
-	movl	12(%esp),%edx
-	movl	16(%esp),%esi
-	movl	20(%esp),%edi
-	addl	(%esp),%eax
-	adcl	4(%esp),%ebx
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	andl	24(%esp),%ecx
-	andl	28(%esp),%edx
-	andl	8(%esp),%esi
-	andl	12(%esp),%edi
-	orl	%esi,%ecx
-	orl	%edi,%edx
-	addl	%ecx,%eax
-	adcl	%edx,%ebx
-	movl	%eax,(%esp)
-	movl	%ebx,4(%esp)
-	movb	(%ebp),%dl
-	subl	$8,%esp
-	leal	8(%ebp),%ebp
-	cmpb	$23,%dl
-	jne	L01016_79_x86
-	movl	840(%esp),%esi
-	movl	844(%esp),%edi
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
-	addl	8(%esp),%eax
-	adcl	12(%esp),%ebx
-	movl	%eax,(%esi)
-	movl	%ebx,4(%esi)
-	addl	16(%esp),%ecx
-	adcl	20(%esp),%edx
-	movl	%ecx,8(%esi)
-	movl	%edx,12(%esi)
-	movl	16(%esi),%eax
-	movl	20(%esi),%ebx
-	movl	24(%esi),%ecx
-	movl	28(%esi),%edx
-	addl	24(%esp),%eax
-	adcl	28(%esp),%ebx
-	movl	%eax,16(%esi)
-	movl	%ebx,20(%esi)
-	addl	32(%esp),%ecx
-	adcl	36(%esp),%edx
-	movl	%ecx,24(%esi)
-	movl	%edx,28(%esi)
-	movl	32(%esi),%eax
-	movl	36(%esi),%ebx
-	movl	40(%esi),%ecx
-	movl	44(%esi),%edx
-	addl	40(%esp),%eax
-	adcl	44(%esp),%ebx
-	movl	%eax,32(%esi)
-	movl	%ebx,36(%esi)
-	addl	48(%esp),%ecx
-	adcl	52(%esp),%edx
-	movl	%ecx,40(%esi)
-	movl	%edx,44(%esi)
-	movl	48(%esi),%eax
-	movl	52(%esi),%ebx
-	movl	56(%esi),%ecx
-	movl	60(%esi),%edx
-	addl	56(%esp),%eax
-	adcl	60(%esp),%ebx
-	movl	%eax,48(%esi)
-	movl	%ebx,52(%esi)
-	addl	64(%esp),%ecx
-	adcl	68(%esp),%edx
-	movl	%ecx,56(%esi)
-	movl	%edx,60(%esi)
-	addl	$840,%esp
-	subl	$640,%ebp
-	cmpl	8(%esp),%edi
-	jb	L002loop_x86
-	movl	12(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.align	6,0x90
-L001K512:
-.long	3609767458,1116352408
-.long	602891725,1899447441
-.long	3964484399,3049323471
-.long	2173295548,3921009573
-.long	4081628472,961987163
-.long	3053834265,1508970993
-.long	2937671579,2453635748
-.long	3664609560,2870763221
-.long	2734883394,3624381080
-.long	1164996542,310598401
-.long	1323610764,607225278
-.long	3590304994,1426881987
-.long	4068182383,1925078388
-.long	991336113,2162078206
-.long	633803317,2614888103
-.long	3479774868,3248222580
-.long	2666613458,3835390401
-.long	944711139,4022224774
-.long	2341262773,264347078
-.long	2007800933,604807628
-.long	1495990901,770255983
-.long	1856431235,1249150122
-.long	3175218132,1555081692
-.long	2198950837,1996064986
-.long	3999719339,2554220882
-.long	766784016,2821834349
-.long	2566594879,2952996808
-.long	3203337956,3210313671
-.long	1034457026,3336571891
-.long	2466948901,3584528711
-.long	3758326383,113926993
-.long	168717936,338241895
-.long	1188179964,666307205
-.long	1546045734,773529912
-.long	1522805485,1294757372
-.long	2643833823,1396182291
-.long	2343527390,1695183700
-.long	1014477480,1986661051
-.long	1206759142,2177026350
-.long	344077627,2456956037
-.long	1290863460,2730485921
-.long	3158454273,2820302411
-.long	3505952657,3259730800
-.long	106217008,3345764771
-.long	3606008344,3516065817
-.long	1432725776,3600352804
-.long	1467031594,4094571909
-.long	851169720,275423344
-.long	3100823752,430227734
-.long	1363258195,506948616
-.long	3750685593,659060556
-.long	3785050280,883997877
-.long	3318307427,958139571
-.long	3812723403,1322822218
-.long	2003034995,1537002063
-.long	3602036899,1747873779
-.long	1575990012,1955562222
-.long	1125592928,2024104815
-.long	2716904306,2227730452
-.long	442776044,2361852424
-.long	593698344,2428436474
-.long	3733110249,2756734187
-.long	2999351573,3204031479
-.long	3815920427,3329325298
-.long	3928383900,3391569614
-.long	566280711,3515267271
-.long	3454069534,3940187606
-.long	4000239992,4118630271
-.long	1914138554,116418474
-.long	2731055270,174292421
-.long	3203993006,289380356
-.long	320620315,460393269
-.long	587496836,685471733
-.long	1086792851,852142971
-.long	365543100,1017036298
-.long	2618297676,1126000580
-.long	3409855158,1288033470
-.long	4234509866,1501505948
-.long	987167468,1607167915
-.long	1246189591,1816402316
-.long	67438087,66051
-.long	202182159,134810123
-.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
-.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
-.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
-.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
-.byte	62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S b/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S
deleted file mode 100644
index 4d2c485..0000000
--- a/apple-x86/crypto/fipsmodule/vpaes-x86-apple.S
+++ /dev/null
@@ -1,680 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-#ifdef BORINGSSL_DISPATCH_TEST
-#endif
-.align	6,0x90
-L_vpaes_consts:
-.long	218628480,235210255,168496130,67568393
-.long	252381056,17041926,33884169,51187212
-.long	252645135,252645135,252645135,252645135
-.long	1512730624,3266504856,1377990664,3401244816
-.long	830229760,1275146365,2969422977,3447763452
-.long	3411033600,2979783055,338359620,2782886510
-.long	4209124096,907596821,221174255,1006095553
-.long	191964160,3799684038,3164090317,1589111125
-.long	182528256,1777043520,2877432650,3265356744
-.long	1874708224,3503451415,3305285752,363511674
-.long	1606117888,3487855781,1093350906,2384367825
-.long	197121,67569157,134941193,202313229
-.long	67569157,134941193,202313229,197121
-.long	134941193,202313229,197121,67569157
-.long	202313229,197121,67569157,134941193
-.long	33619971,100992007,168364043,235736079
-.long	235736079,33619971,100992007,168364043
-.long	168364043,235736079,33619971,100992007
-.long	100992007,168364043,235736079,33619971
-.long	50462976,117835012,185207048,252579084
-.long	252314880,51251460,117574920,184942860
-.long	184682752,252054788,50987272,118359308
-.long	118099200,185467140,251790600,50727180
-.long	2946363062,528716217,1300004225,1881839624
-.long	1532713819,1532713819,1532713819,1532713819
-.long	3602276352,4288629033,3737020424,4153884961
-.long	1354558464,32357713,2958822624,3775749553
-.long	1201988352,132424512,1572796698,503232858
-.long	2213177600,1597421020,4103937655,675398315
-.long	2749646592,4273543773,1511898873,121693092
-.long	3040248576,1103263732,2871565598,1608280554
-.long	2236667136,2588920351,482954393,64377734
-.long	3069987328,291237287,2117370568,3650299247
-.long	533321216,3573750986,2572112006,1401264716
-.long	1339849704,2721158661,548607111,3445553514
-.long	2128193280,3054596040,2183486460,1257083700
-.long	655635200,1165381986,3923443150,2344132524
-.long	190078720,256924420,290342170,357187870
-.long	1610966272,2263057382,4103205268,309794674
-.long	2592527872,2233205587,1335446729,3402964816
-.long	3973531904,3225098121,3002836325,1918774430
-.long	3870401024,2102906079,2284471353,4117666579
-.long	617007872,1021508343,366931923,691083277
-.long	2528395776,3491914898,2968704004,1613121270
-.long	3445188352,3247741094,844474987,4093578302
-.long	651481088,1190302358,1689581232,574775300
-.long	4289380608,206939853,2555985458,2489840491
-.long	2130264064,327674451,3566485037,3349835193
-.long	2470714624,316102159,3636825756,3393945945
-.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
-.byte	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
-.byte	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
-.byte	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
-.byte	118,101,114,115,105,116,121,41,0
-.align	6,0x90
-.private_extern	__vpaes_preheat
-.align	4
-__vpaes_preheat:
-	addl	(%esp),%ebp
-	movdqa	-48(%ebp),%xmm7
-	movdqa	-16(%ebp),%xmm6
-	ret
-.private_extern	__vpaes_encrypt_core
-.align	4
-__vpaes_encrypt_core:
-	movl	$16,%ecx
-	movl	240(%edx),%eax
-	movdqa	%xmm6,%xmm1
-	movdqa	(%ebp),%xmm2
-	pandn	%xmm0,%xmm1
-	pand	%xmm6,%xmm0
-	movdqu	(%edx),%xmm5
-.byte	102,15,56,0,208
-	movdqa	16(%ebp),%xmm0
-	pxor	%xmm5,%xmm2
-	psrld	$4,%xmm1
-	addl	$16,%edx
-.byte	102,15,56,0,193
-	leal	192(%ebp),%ebx
-	pxor	%xmm2,%xmm0
-	jmp	L000enc_entry
-.align	4,0x90
-L001enc_loop:
-	movdqa	32(%ebp),%xmm4
-	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
-	pxor	%xmm5,%xmm4
-	movdqa	64(%ebp),%xmm5
-	pxor	%xmm4,%xmm0
-	movdqa	-64(%ebx,%ecx,1),%xmm1
-.byte	102,15,56,0,234
-	movdqa	80(%ebp),%xmm2
-	movdqa	(%ebx,%ecx,1),%xmm4
-.byte	102,15,56,0,211
-	movdqa	%xmm0,%xmm3
-	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
-	addl	$16,%edx
-	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
-	addl	$16,%ecx
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
-	andl	$48,%ecx
-	subl	$1,%eax
-	pxor	%xmm3,%xmm0
-L000enc_entry:
-	movdqa	%xmm6,%xmm1
-	movdqa	-32(%ebp),%xmm5
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm6,%xmm0
-.byte	102,15,56,0,232
-	movdqa	%xmm7,%xmm3
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
-	movdqa	%xmm7,%xmm4
-	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
-	movdqa	%xmm7,%xmm2
-	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
-	movdqa	%xmm7,%xmm3
-	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
-	movdqu	(%edx),%xmm5
-	pxor	%xmm1,%xmm3
-	jnz	L001enc_loop
-	movdqa	96(%ebp),%xmm4
-	movdqa	112(%ebp),%xmm0
-.byte	102,15,56,0,226
-	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
-	movdqa	64(%ebx,%ecx,1),%xmm1
-	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
-	ret
-.private_extern	__vpaes_decrypt_core
-.align	4
-__vpaes_decrypt_core:
-	leal	608(%ebp),%ebx
-	movl	240(%edx),%eax
-	movdqa	%xmm6,%xmm1
-	movdqa	-64(%ebx),%xmm2
-	pandn	%xmm0,%xmm1
-	movl	%eax,%ecx
-	psrld	$4,%xmm1
-	movdqu	(%edx),%xmm5
-	shll	$4,%ecx
-	pand	%xmm6,%xmm0
-.byte	102,15,56,0,208
-	movdqa	-48(%ebx),%xmm0
-	xorl	$48,%ecx
-.byte	102,15,56,0,193
-	andl	$48,%ecx
-	pxor	%xmm5,%xmm2
-	movdqa	176(%ebp),%xmm5
-	pxor	%xmm2,%xmm0
-	addl	$16,%edx
-	leal	-352(%ebx,%ecx,1),%ecx
-	jmp	L002dec_entry
-.align	4,0x90
-L003dec_loop:
-	movdqa	-32(%ebx),%xmm4
-	movdqa	-16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	(%ebx),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	16(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	32(%ebx),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	48(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	64(%ebx),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	80(%ebx),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	addl	$16,%edx
-.byte	102,15,58,15,237,12
-	pxor	%xmm1,%xmm0
-	subl	$1,%eax
-L002dec_entry:
-	movdqa	%xmm6,%xmm1
-	movdqa	-32(%ebp),%xmm2
-	pandn	%xmm0,%xmm1
-	pand	%xmm6,%xmm0
-	psrld	$4,%xmm1
-.byte	102,15,56,0,208
-	movdqa	%xmm7,%xmm3
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
-	movdqa	%xmm7,%xmm4
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
-	pxor	%xmm2,%xmm4
-	movdqa	%xmm7,%xmm2
-.byte	102,15,56,0,211
-	movdqa	%xmm7,%xmm3
-	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
-	movdqu	(%edx),%xmm0
-	pxor	%xmm1,%xmm3
-	jnz	L003dec_loop
-	movdqa	96(%ebx),%xmm4
-.byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	112(%ebx),%xmm0
-	movdqa	(%ecx),%xmm2
-.byte	102,15,56,0,195
-	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
-	ret
-.private_extern	__vpaes_schedule_core
-.align	4
-__vpaes_schedule_core:
-	addl	(%esp),%ebp
-	movdqu	(%esi),%xmm0
-	movdqa	320(%ebp),%xmm2
-	movdqa	%xmm0,%xmm3
-	leal	(%ebp),%ebx
-	movdqa	%xmm2,4(%esp)
-	call	__vpaes_schedule_transform
-	movdqa	%xmm0,%xmm7
-	testl	%edi,%edi
-	jnz	L004schedule_am_decrypting
-	movdqu	%xmm0,(%edx)
-	jmp	L005schedule_go
-L004schedule_am_decrypting:
-	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
-	movdqu	%xmm3,(%edx)
-	xorl	$48,%ecx
-L005schedule_go:
-	cmpl	$192,%eax
-	ja	L006schedule_256
-	je	L007schedule_192
-L008schedule_128:
-	movl	$10,%eax
-L009loop_schedule_128:
-	call	__vpaes_schedule_round
-	decl	%eax
-	jz	L010schedule_mangle_last
-	call	__vpaes_schedule_mangle
-	jmp	L009loop_schedule_128
-.align	4,0x90
-L007schedule_192:
-	movdqu	8(%esi),%xmm0
-	call	__vpaes_schedule_transform
-	movdqa	%xmm0,%xmm6
-	pxor	%xmm4,%xmm4
-	movhlps	%xmm4,%xmm6
-	movl	$4,%eax
-L011loop_schedule_192:
-	call	__vpaes_schedule_round
-.byte	102,15,58,15,198,8
-	call	__vpaes_schedule_mangle
-	call	__vpaes_schedule_192_smear
-	call	__vpaes_schedule_mangle
-	call	__vpaes_schedule_round
-	decl	%eax
-	jz	L010schedule_mangle_last
-	call	__vpaes_schedule_mangle
-	call	__vpaes_schedule_192_smear
-	jmp	L011loop_schedule_192
-.align	4,0x90
-L006schedule_256:
-	movdqu	16(%esi),%xmm0
-	call	__vpaes_schedule_transform
-	movl	$7,%eax
-L012loop_schedule_256:
-	call	__vpaes_schedule_mangle
-	movdqa	%xmm0,%xmm6
-	call	__vpaes_schedule_round
-	decl	%eax
-	jz	L010schedule_mangle_last
-	call	__vpaes_schedule_mangle
-	pshufd	$255,%xmm0,%xmm0
-	movdqa	%xmm7,20(%esp)
-	movdqa	%xmm6,%xmm7
-	call	L_vpaes_schedule_low_round
-	movdqa	20(%esp),%xmm7
-	jmp	L012loop_schedule_256
-.align	4,0x90
-L010schedule_mangle_last:
-	leal	384(%ebp),%ebx
-	testl	%edi,%edi
-	jnz	L013schedule_mangle_last_dec
-	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,193
-	leal	352(%ebp),%ebx
-	addl	$32,%edx
-L013schedule_mangle_last_dec:
-	addl	$-16,%edx
-	pxor	336(%ebp),%xmm0
-	call	__vpaes_schedule_transform
-	movdqu	%xmm0,(%edx)
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	ret
-.private_extern	__vpaes_schedule_192_smear
-.align	4
-__vpaes_schedule_192_smear:
-	pshufd	$128,%xmm6,%xmm1
-	pshufd	$254,%xmm7,%xmm0
-	pxor	%xmm1,%xmm6
-	pxor	%xmm1,%xmm1
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm6,%xmm0
-	movhlps	%xmm1,%xmm6
-	ret
-.private_extern	__vpaes_schedule_round
-.align	4
-__vpaes_schedule_round:
-	movdqa	8(%esp),%xmm2
-	pxor	%xmm1,%xmm1
-.byte	102,15,58,15,202,15
-.byte	102,15,58,15,210,15
-	pxor	%xmm1,%xmm7
-	pshufd	$255,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
-	movdqa	%xmm2,8(%esp)
-L_vpaes_schedule_low_round:
-	movdqa	%xmm7,%xmm1
-	pslldq	$4,%xmm7
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm7,%xmm1
-	pslldq	$8,%xmm7
-	pxor	%xmm1,%xmm7
-	pxor	336(%ebp),%xmm7
-	movdqa	-16(%ebp),%xmm4
-	movdqa	-48(%ebp),%xmm5
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm4,%xmm0
-	movdqa	-32(%ebp),%xmm2
-.byte	102,15,56,0,208
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-	movdqa	%xmm5,%xmm4
-.byte	102,15,56,0,224
-	pxor	%xmm2,%xmm4
-	movdqa	%xmm5,%xmm2
-.byte	102,15,56,0,211
-	pxor	%xmm0,%xmm2
-	movdqa	%xmm5,%xmm3
-.byte	102,15,56,0,220
-	pxor	%xmm1,%xmm3
-	movdqa	32(%ebp),%xmm4
-.byte	102,15,56,0,226
-	movdqa	48(%ebp),%xmm0
-.byte	102,15,56,0,195
-	pxor	%xmm4,%xmm0
-	pxor	%xmm7,%xmm0
-	movdqa	%xmm0,%xmm7
-	ret
-.private_extern	__vpaes_schedule_transform
-.align	4
-__vpaes_schedule_transform:
-	movdqa	-16(%ebp),%xmm2
-	movdqa	%xmm2,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm2,%xmm0
-	movdqa	(%ebx),%xmm2
-.byte	102,15,56,0,208
-	movdqa	16(%ebx),%xmm0
-.byte	102,15,56,0,193
-	pxor	%xmm2,%xmm0
-	ret
-.private_extern	__vpaes_schedule_mangle
-.align	4
-__vpaes_schedule_mangle:
-	movdqa	%xmm0,%xmm4
-	movdqa	128(%ebp),%xmm5
-	testl	%edi,%edi
-	jnz	L014schedule_mangle_dec
-	addl	$16,%edx
-	pxor	336(%ebp),%xmm4
-.byte	102,15,56,0,229
-	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
-	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
-	pxor	%xmm4,%xmm3
-	jmp	L015schedule_mangle_both
-.align	4,0x90
-L014schedule_mangle_dec:
-	movdqa	-16(%ebp),%xmm2
-	leal	416(%ebp),%esi
-	movdqa	%xmm2,%xmm1
-	pandn	%xmm4,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm2,%xmm4
-	movdqa	(%esi),%xmm2
-.byte	102,15,56,0,212
-	movdqa	16(%esi),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-	movdqa	32(%esi),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	48(%esi),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-	movdqa	64(%esi),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	80(%esi),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-	movdqa	96(%esi),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	112(%esi),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-	addl	$-16,%edx
-L015schedule_mangle_both:
-	movdqa	256(%ebp,%ecx,1),%xmm1
-.byte	102,15,56,0,217
-	addl	$-16,%ecx
-	andl	$48,%ecx
-	movdqu	%xmm3,(%edx)
-	ret
-.globl	_vpaes_set_encrypt_key
-.private_extern	_vpaes_set_encrypt_key
-.align	4
-_vpaes_set_encrypt_key:
-L_vpaes_set_encrypt_key_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L016pic
-L016pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	movl	20(%esp),%esi
-	leal	-56(%esp),%ebx
-	movl	24(%esp),%eax
-	andl	$-16,%ebx
-	movl	28(%esp),%edx
-	xchgl	%esp,%ebx
-	movl	%ebx,48(%esp)
-	movl	%eax,%ebx
-	shrl	$5,%ebx
-	addl	$5,%ebx
-	movl	%ebx,240(%edx)
-	movl	$48,%ecx
-	movl	$0,%edi
-	leal	L_vpaes_consts+0x30-L017pic_point,%ebp
-	call	__vpaes_schedule_core
-L017pic_point:
-	movl	48(%esp),%esp
-	xorl	%eax,%eax
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_vpaes_set_decrypt_key
-.private_extern	_vpaes_set_decrypt_key
-.align	4
-_vpaes_set_decrypt_key:
-L_vpaes_set_decrypt_key_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	leal	-56(%esp),%ebx
-	movl	24(%esp),%eax
-	andl	$-16,%ebx
-	movl	28(%esp),%edx
-	xchgl	%esp,%ebx
-	movl	%ebx,48(%esp)
-	movl	%eax,%ebx
-	shrl	$5,%ebx
-	addl	$5,%ebx
-	movl	%ebx,240(%edx)
-	shll	$4,%ebx
-	leal	16(%edx,%ebx,1),%edx
-	movl	$1,%edi
-	movl	%eax,%ecx
-	shrl	$1,%ecx
-	andl	$32,%ecx
-	xorl	$32,%ecx
-	leal	L_vpaes_consts+0x30-L018pic_point,%ebp
-	call	__vpaes_schedule_core
-L018pic_point:
-	movl	48(%esp),%esp
-	xorl	%eax,%eax
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_vpaes_encrypt
-.private_extern	_vpaes_encrypt
-.align	4
-_vpaes_encrypt:
-L_vpaes_encrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-#ifdef BORINGSSL_DISPATCH_TEST
-	pushl	%ebx
-	pushl	%edx
-	call	L019pic
-L019pic:
-	popl	%ebx
-	leal	_BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
-	movl	$1,%edx
-	movb	%dl,(%ebx)
-	popl	%edx
-	popl	%ebx
-#endif
-	leal	L_vpaes_consts+0x30-L020pic_point,%ebp
-	call	__vpaes_preheat
-L020pic_point:
-	movl	20(%esp),%esi
-	leal	-56(%esp),%ebx
-	movl	24(%esp),%edi
-	andl	$-16,%ebx
-	movl	28(%esp),%edx
-	xchgl	%esp,%ebx
-	movl	%ebx,48(%esp)
-	movdqu	(%esi),%xmm0
-	call	__vpaes_encrypt_core
-	movdqu	%xmm0,(%edi)
-	movl	48(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_vpaes_decrypt
-.private_extern	_vpaes_decrypt
-.align	4
-_vpaes_decrypt:
-L_vpaes_decrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	leal	L_vpaes_consts+0x30-L021pic_point,%ebp
-	call	__vpaes_preheat
-L021pic_point:
-	movl	20(%esp),%esi
-	leal	-56(%esp),%ebx
-	movl	24(%esp),%edi
-	andl	$-16,%ebx
-	movl	28(%esp),%edx
-	xchgl	%esp,%ebx
-	movl	%ebx,48(%esp)
-	movdqu	(%esi),%xmm0
-	call	__vpaes_decrypt_core
-	movdqu	%xmm0,(%edi)
-	movl	48(%esp),%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_vpaes_cbc_encrypt
-.private_extern	_vpaes_cbc_encrypt
-.align	4
-_vpaes_cbc_encrypt:
-L_vpaes_cbc_encrypt_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	20(%esp),%esi
-	movl	24(%esp),%edi
-	movl	28(%esp),%eax
-	movl	32(%esp),%edx
-	subl	$16,%eax
-	jc	L022cbc_abort
-	leal	-56(%esp),%ebx
-	movl	36(%esp),%ebp
-	andl	$-16,%ebx
-	movl	40(%esp),%ecx
-	xchgl	%esp,%ebx
-	movdqu	(%ebp),%xmm1
-	subl	%esi,%edi
-	movl	%ebx,48(%esp)
-	movl	%edi,(%esp)
-	movl	%edx,4(%esp)
-	movl	%ebp,8(%esp)
-	movl	%eax,%edi
-	leal	L_vpaes_consts+0x30-L023pic_point,%ebp
-	call	__vpaes_preheat
-L023pic_point:
-	cmpl	$0,%ecx
-	je	L024cbc_dec_loop
-	jmp	L025cbc_enc_loop
-.align	4,0x90
-L025cbc_enc_loop:
-	movdqu	(%esi),%xmm0
-	pxor	%xmm1,%xmm0
-	call	__vpaes_encrypt_core
-	movl	(%esp),%ebx
-	movl	4(%esp),%edx
-	movdqa	%xmm0,%xmm1
-	movdqu	%xmm0,(%ebx,%esi,1)
-	leal	16(%esi),%esi
-	subl	$16,%edi
-	jnc	L025cbc_enc_loop
-	jmp	L026cbc_done
-.align	4,0x90
-L024cbc_dec_loop:
-	movdqu	(%esi),%xmm0
-	movdqa	%xmm1,16(%esp)
-	movdqa	%xmm0,32(%esp)
-	call	__vpaes_decrypt_core
-	movl	(%esp),%ebx
-	movl	4(%esp),%edx
-	pxor	16(%esp),%xmm0
-	movdqa	32(%esp),%xmm1
-	movdqu	%xmm0,(%ebx,%esi,1)
-	leal	16(%esi),%esi
-	subl	$16,%edi
-	jnc	L024cbc_dec_loop
-L026cbc_done:
-	movl	8(%esp),%ebx
-	movl	48(%esp),%esp
-	movdqu	%xmm1,(%ebx)
-L022cbc_abort:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/fipsmodule/x86-mont-apple.S b/apple-x86/crypto/fipsmodule/x86-mont-apple.S
deleted file mode 100644
index f991f6c..0000000
--- a/apple-x86/crypto/fipsmodule/x86-mont-apple.S
+++ /dev/null
@@ -1,484 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_bn_mul_mont
-.private_extern	_bn_mul_mont
-.align	4
-_bn_mul_mont:
-L_bn_mul_mont_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	xorl	%eax,%eax
-	movl	40(%esp),%edi
-	cmpl	$4,%edi
-	jl	L000just_leave
-	leal	20(%esp),%esi
-	leal	24(%esp),%edx
-	addl	$2,%edi
-	negl	%edi
-	leal	-32(%esp,%edi,4),%ebp
-	negl	%edi
-	movl	%ebp,%eax
-	subl	%edx,%eax
-	andl	$2047,%eax
-	subl	%eax,%ebp
-	xorl	%ebp,%edx
-	andl	$2048,%edx
-	xorl	$2048,%edx
-	subl	%edx,%ebp
-	andl	$-64,%ebp
-	movl	%esp,%eax
-	subl	%ebp,%eax
-	andl	$-4096,%eax
-	movl	%esp,%edx
-	leal	(%ebp,%eax,1),%esp
-	movl	(%esp),%eax
-	cmpl	%ebp,%esp
-	ja	L001page_walk
-	jmp	L002page_walk_done
-.align	4,0x90
-L001page_walk:
-	leal	-4096(%esp),%esp
-	movl	(%esp),%eax
-	cmpl	%ebp,%esp
-	ja	L001page_walk
-L002page_walk_done:
-	movl	(%esi),%eax
-	movl	4(%esi),%ebx
-	movl	8(%esi),%ecx
-	movl	12(%esi),%ebp
-	movl	16(%esi),%esi
-	movl	(%esi),%esi
-	movl	%eax,4(%esp)
-	movl	%ebx,8(%esp)
-	movl	%ecx,12(%esp)
-	movl	%ebp,16(%esp)
-	movl	%esi,20(%esp)
-	leal	-3(%edi),%ebx
-	movl	%edx,24(%esp)
-	call	L003PIC_me_up
-L003PIC_me_up:
-	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
-	btl	$26,(%eax)
-	jnc	L004non_sse2
-	movl	$-1,%eax
-	movd	%eax,%mm7
-	movl	8(%esp),%esi
-	movl	12(%esp),%edi
-	movl	16(%esp),%ebp
-	xorl	%edx,%edx
-	xorl	%ecx,%ecx
-	movd	(%edi),%mm4
-	movd	(%esi),%mm5
-	movd	(%ebp),%mm3
-	pmuludq	%mm4,%mm5
-	movq	%mm5,%mm2
-	movq	%mm5,%mm0
-	pand	%mm7,%mm0
-	pmuludq	20(%esp),%mm5
-	pmuludq	%mm5,%mm3
-	paddq	%mm0,%mm3
-	movd	4(%ebp),%mm1
-	movd	4(%esi),%mm0
-	psrlq	$32,%mm2
-	psrlq	$32,%mm3
-	incl	%ecx
-.align	4,0x90
-L0051st:
-	pmuludq	%mm4,%mm0
-	pmuludq	%mm5,%mm1
-	paddq	%mm0,%mm2
-	paddq	%mm1,%mm3
-	movq	%mm2,%mm0
-	pand	%mm7,%mm0
-	movd	4(%ebp,%ecx,4),%mm1
-	paddq	%mm0,%mm3
-	movd	4(%esi,%ecx,4),%mm0
-	psrlq	$32,%mm2
-	movd	%mm3,28(%esp,%ecx,4)
-	psrlq	$32,%mm3
-	leal	1(%ecx),%ecx
-	cmpl	%ebx,%ecx
-	jl	L0051st
-	pmuludq	%mm4,%mm0
-	pmuludq	%mm5,%mm1
-	paddq	%mm0,%mm2
-	paddq	%mm1,%mm3
-	movq	%mm2,%mm0
-	pand	%mm7,%mm0
-	paddq	%mm0,%mm3
-	movd	%mm3,28(%esp,%ecx,4)
-	psrlq	$32,%mm2
-	psrlq	$32,%mm3
-	paddq	%mm2,%mm3
-	movq	%mm3,32(%esp,%ebx,4)
-	incl	%edx
-L006outer:
-	xorl	%ecx,%ecx
-	movd	(%edi,%edx,4),%mm4
-	movd	(%esi),%mm5
-	movd	32(%esp),%mm6
-	movd	(%ebp),%mm3
-	pmuludq	%mm4,%mm5
-	paddq	%mm6,%mm5
-	movq	%mm5,%mm0
-	movq	%mm5,%mm2
-	pand	%mm7,%mm0
-	pmuludq	20(%esp),%mm5
-	pmuludq	%mm5,%mm3
-	paddq	%mm0,%mm3
-	movd	36(%esp),%mm6
-	movd	4(%ebp),%mm1
-	movd	4(%esi),%mm0
-	psrlq	$32,%mm2
-	psrlq	$32,%mm3
-	paddq	%mm6,%mm2
-	incl	%ecx
-	decl	%ebx
-L007inner:
-	pmuludq	%mm4,%mm0
-	pmuludq	%mm5,%mm1
-	paddq	%mm0,%mm2
-	paddq	%mm1,%mm3
-	movq	%mm2,%mm0
-	movd	36(%esp,%ecx,4),%mm6
-	pand	%mm7,%mm0
-	movd	4(%ebp,%ecx,4),%mm1
-	paddq	%mm0,%mm3
-	movd	4(%esi,%ecx,4),%mm0
-	psrlq	$32,%mm2
-	movd	%mm3,28(%esp,%ecx,4)
-	psrlq	$32,%mm3
-	paddq	%mm6,%mm2
-	decl	%ebx
-	leal	1(%ecx),%ecx
-	jnz	L007inner
-	movl	%ecx,%ebx
-	pmuludq	%mm4,%mm0
-	pmuludq	%mm5,%mm1
-	paddq	%mm0,%mm2
-	paddq	%mm1,%mm3
-	movq	%mm2,%mm0
-	pand	%mm7,%mm0
-	paddq	%mm0,%mm3
-	movd	%mm3,28(%esp,%ecx,4)
-	psrlq	$32,%mm2
-	psrlq	$32,%mm3
-	movd	36(%esp,%ebx,4),%mm6
-	paddq	%mm2,%mm3
-	paddq	%mm6,%mm3
-	movq	%mm3,32(%esp,%ebx,4)
-	leal	1(%edx),%edx
-	cmpl	%ebx,%edx
-	jle	L006outer
-	emms
-	jmp	L008common_tail
-.align	4,0x90
-L004non_sse2:
-	movl	8(%esp),%esi
-	leal	1(%ebx),%ebp
-	movl	12(%esp),%edi
-	xorl	%ecx,%ecx
-	movl	%esi,%edx
-	andl	$1,%ebp
-	subl	%edi,%edx
-	leal	4(%edi,%ebx,4),%eax
-	orl	%edx,%ebp
-	movl	(%edi),%edi
-	jz	L009bn_sqr_mont
-	movl	%eax,28(%esp)
-	movl	(%esi),%eax
-	xorl	%edx,%edx
-.align	4,0x90
-L010mull:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%eax,%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	movl	(%esi,%ecx,4),%eax
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L010mull
-	movl	%edx,%ebp
-	mull	%edi
-	movl	20(%esp),%edi
-	addl	%ebp,%eax
-	movl	16(%esp),%esi
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	movl	%eax,32(%esp,%ebx,4)
-	xorl	%ecx,%ecx
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	movl	(%esi),%eax
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	incl	%ecx
-	jmp	L0112ndmadd
-.align	4,0x90
-L0121stmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L0121stmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	imull	32(%esp),%edi
-	xorl	%ecx,%ecx
-	addl	36(%esp,%ebx,4),%edx
-	movl	%ebp,32(%esp,%ebx,4)
-	adcl	$0,%ecx
-	movl	(%esi),%eax
-	movl	%edx,36(%esp,%ebx,4)
-	movl	%ecx,40(%esp,%ebx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	4(%esi),%eax
-	adcl	$0,%edx
-	movl	$1,%ecx
-.align	4,0x90
-L0112ndmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0112ndmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	xorl	%eax,%eax
-	movl	12(%esp),%ecx
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	leal	4(%ecx),%ecx
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	28(%esp),%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	L008common_tail
-	movl	(%ecx),%edi
-	movl	8(%esp),%esi
-	movl	%ecx,12(%esp)
-	xorl	%ecx,%ecx
-	xorl	%edx,%edx
-	movl	(%esi),%eax
-	jmp	L0121stmadd
-.align	4,0x90
-L009bn_sqr_mont:
-	movl	%ebx,(%esp)
-	movl	%ecx,12(%esp)
-	movl	%edi,%eax
-	mull	%edi
-	movl	%eax,32(%esp)
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-	incl	%ecx
-.align	4,0x90
-L013sqr:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	1(%ecx),%ecx
-	adcl	$0,%edx
-	leal	(%ebx,%eax,2),%ebp
-	shrl	$31,%eax
-	cmpl	(%esp),%ecx
-	movl	%eax,%ebx
-	movl	%ebp,28(%esp,%ecx,4)
-	jl	L013sqr
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	movl	20(%esp),%edi
-	adcl	$0,%edx
-	movl	16(%esp),%esi
-	leal	(%ebx,%eax,2),%ebp
-	imull	32(%esp),%edi
-	shrl	$31,%eax
-	movl	%ebp,32(%esp,%ecx,4)
-	leal	(%eax,%edx,2),%ebp
-	movl	(%esi),%eax
-	shrl	$31,%edx
-	movl	%ebp,36(%esp,%ecx,4)
-	movl	%edx,40(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	movl	%ecx,%ebx
-	adcl	$0,%edx
-	movl	4(%esi),%eax
-	movl	$1,%ecx
-.align	4,0x90
-L0143rdmadd:
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ecx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	4(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%edx,%ebp
-	mull	%edi
-	addl	36(%esp,%ecx,4),%ebp
-	leal	2(%ecx),%ecx
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	movl	(%esi,%ecx,4),%eax
-	adcl	$0,%edx
-	cmpl	%ebx,%ecx
-	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0143rdmadd
-	movl	%edx,%ebp
-	mull	%edi
-	addl	32(%esp,%ebx,4),%ebp
-	adcl	$0,%edx
-	addl	%eax,%ebp
-	adcl	$0,%edx
-	movl	%ebp,28(%esp,%ebx,4)
-	movl	12(%esp),%ecx
-	xorl	%eax,%eax
-	movl	8(%esp),%esi
-	addl	36(%esp,%ebx,4),%edx
-	adcl	40(%esp,%ebx,4),%eax
-	movl	%edx,32(%esp,%ebx,4)
-	cmpl	%ebx,%ecx
-	movl	%eax,36(%esp,%ebx,4)
-	je	L008common_tail
-	movl	4(%esi,%ecx,4),%edi
-	leal	1(%ecx),%ecx
-	movl	%edi,%eax
-	movl	%ecx,12(%esp)
-	mull	%edi
-	addl	32(%esp,%ecx,4),%eax
-	adcl	$0,%edx
-	movl	%eax,32(%esp,%ecx,4)
-	xorl	%ebp,%ebp
-	cmpl	%ebx,%ecx
-	leal	1(%ecx),%ecx
-	je	L015sqrlast
-	movl	%edx,%ebx
-	shrl	$1,%edx
-	andl	$1,%ebx
-.align	4,0x90
-L016sqradd:
-	movl	(%esi,%ecx,4),%eax
-	movl	%edx,%ebp
-	mull	%edi
-	addl	%ebp,%eax
-	leal	(%eax,%eax,1),%ebp
-	adcl	$0,%edx
-	shrl	$31,%eax
-	addl	32(%esp,%ecx,4),%ebp
-	leal	1(%ecx),%ecx
-	adcl	$0,%eax
-	addl	%ebx,%ebp
-	adcl	$0,%eax
-	cmpl	(%esp),%ecx
-	movl	%ebp,28(%esp,%ecx,4)
-	movl	%eax,%ebx
-	jle	L016sqradd
-	movl	%edx,%ebp
-	addl	%edx,%edx
-	shrl	$31,%ebp
-	addl	%ebx,%edx
-	adcl	$0,%ebp
-L015sqrlast:
-	movl	20(%esp),%edi
-	movl	16(%esp),%esi
-	imull	32(%esp),%edi
-	addl	32(%esp,%ecx,4),%edx
-	movl	(%esi),%eax
-	adcl	$0,%ebp
-	movl	%edx,32(%esp,%ecx,4)
-	movl	%ebp,36(%esp,%ecx,4)
-	mull	%edi
-	addl	32(%esp),%eax
-	leal	-1(%ecx),%ebx
-	adcl	$0,%edx
-	movl	$1,%ecx
-	movl	4(%esi),%eax
-	jmp	L0143rdmadd
-.align	4,0x90
-L008common_tail:
-	movl	16(%esp),%ebp
-	movl	4(%esp),%edi
-	leal	32(%esp),%esi
-	movl	(%esi),%eax
-	movl	%ebx,%ecx
-	xorl	%edx,%edx
-.align	4,0x90
-L017sub:
-	sbbl	(%ebp,%edx,4),%eax
-	movl	%eax,(%edi,%edx,4)
-	decl	%ecx
-	movl	4(%esi,%edx,4),%eax
-	leal	1(%edx),%edx
-	jge	L017sub
-	sbbl	$0,%eax
-	movl	$-1,%edx
-	xorl	%eax,%edx
-	jmp	L018copy
-.align	4,0x90
-L018copy:
-	movl	32(%esp,%ebx,4),%esi
-	movl	(%edi,%ebx,4),%ebp
-	movl	%ecx,32(%esp,%ebx,4)
-	andl	%eax,%esi
-	andl	%edx,%ebp
-	orl	%esi,%ebp
-	movl	%ebp,(%edi,%ebx,4)
-	decl	%ebx
-	jge	L018copy
-	movl	24(%esp),%esp
-	movl	$1,%eax
-L000just_leave:
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
-.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
-.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
-.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
-.byte	111,114,103,62,0
-.section __IMPORT,__pointers,non_lazy_symbol_pointers
-L_OPENSSL_ia32cap_P$non_lazy_ptr:
-.indirect_symbol	_OPENSSL_ia32cap_P
-.long	0
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86/crypto/test/trampoline-x86-apple.S b/apple-x86/crypto/test/trampoline-x86-apple.S
deleted file mode 100644
index 4065b9a..0000000
--- a/apple-x86/crypto/test/trampoline-x86-apple.S
+++ /dev/null
@@ -1,168 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
-.text
-.globl	_abi_test_trampoline
-.private_extern	_abi_test_trampoline
-.align	4
-_abi_test_trampoline:
-L_abi_test_trampoline_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	movl	24(%esp),%ecx
-	movl	(%ecx),%esi
-	movl	4(%ecx),%edi
-	movl	8(%ecx),%ebx
-	movl	12(%ecx),%ebp
-	subl	$44,%esp
-	movl	72(%esp),%eax
-	xorl	%ecx,%ecx
-L000loop:
-	cmpl	76(%esp),%ecx
-	jae	L001loop_done
-	movl	(%eax,%ecx,4),%edx
-	movl	%edx,(%esp,%ecx,4)
-	addl	$1,%ecx
-	jmp	L000loop
-L001loop_done:
-	call	*64(%esp)
-	addl	$44,%esp
-	movl	24(%esp),%ecx
-	movl	%esi,(%ecx)
-	movl	%edi,4(%ecx)
-	movl	%ebx,8(%ecx)
-	movl	%ebp,12(%ecx)
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_abi_test_get_and_clear_direction_flag
-.private_extern	_abi_test_get_and_clear_direction_flag
-.align	4
-_abi_test_get_and_clear_direction_flag:
-L_abi_test_get_and_clear_direction_flag_begin:
-	pushfl
-	popl	%eax
-	andl	$1024,%eax
-	shrl	$10,%eax
-	cld
-	ret
-.globl	_abi_test_set_direction_flag
-.private_extern	_abi_test_set_direction_flag
-.align	4
-_abi_test_set_direction_flag:
-L_abi_test_set_direction_flag_begin:
-	std
-	ret
-.globl	_abi_test_clobber_eax
-.private_extern	_abi_test_clobber_eax
-.align	4
-_abi_test_clobber_eax:
-L_abi_test_clobber_eax_begin:
-	xorl	%eax,%eax
-	ret
-.globl	_abi_test_clobber_ebx
-.private_extern	_abi_test_clobber_ebx
-.align	4
-_abi_test_clobber_ebx:
-L_abi_test_clobber_ebx_begin:
-	xorl	%ebx,%ebx
-	ret
-.globl	_abi_test_clobber_ecx
-.private_extern	_abi_test_clobber_ecx
-.align	4
-_abi_test_clobber_ecx:
-L_abi_test_clobber_ecx_begin:
-	xorl	%ecx,%ecx
-	ret
-.globl	_abi_test_clobber_edx
-.private_extern	_abi_test_clobber_edx
-.align	4
-_abi_test_clobber_edx:
-L_abi_test_clobber_edx_begin:
-	xorl	%edx,%edx
-	ret
-.globl	_abi_test_clobber_edi
-.private_extern	_abi_test_clobber_edi
-.align	4
-_abi_test_clobber_edi:
-L_abi_test_clobber_edi_begin:
-	xorl	%edi,%edi
-	ret
-.globl	_abi_test_clobber_esi
-.private_extern	_abi_test_clobber_esi
-.align	4
-_abi_test_clobber_esi:
-L_abi_test_clobber_esi_begin:
-	xorl	%esi,%esi
-	ret
-.globl	_abi_test_clobber_ebp
-.private_extern	_abi_test_clobber_ebp
-.align	4
-_abi_test_clobber_ebp:
-L_abi_test_clobber_ebp_begin:
-	xorl	%ebp,%ebp
-	ret
-.globl	_abi_test_clobber_xmm0
-.private_extern	_abi_test_clobber_xmm0
-.align	4
-_abi_test_clobber_xmm0:
-L_abi_test_clobber_xmm0_begin:
-	pxor	%xmm0,%xmm0
-	ret
-.globl	_abi_test_clobber_xmm1
-.private_extern	_abi_test_clobber_xmm1
-.align	4
-_abi_test_clobber_xmm1:
-L_abi_test_clobber_xmm1_begin:
-	pxor	%xmm1,%xmm1
-	ret
-.globl	_abi_test_clobber_xmm2
-.private_extern	_abi_test_clobber_xmm2
-.align	4
-_abi_test_clobber_xmm2:
-L_abi_test_clobber_xmm2_begin:
-	pxor	%xmm2,%xmm2
-	ret
-.globl	_abi_test_clobber_xmm3
-.private_extern	_abi_test_clobber_xmm3
-.align	4
-_abi_test_clobber_xmm3:
-L_abi_test_clobber_xmm3_begin:
-	pxor	%xmm3,%xmm3
-	ret
-.globl	_abi_test_clobber_xmm4
-.private_extern	_abi_test_clobber_xmm4
-.align	4
-_abi_test_clobber_xmm4:
-L_abi_test_clobber_xmm4_begin:
-	pxor	%xmm4,%xmm4
-	ret
-.globl	_abi_test_clobber_xmm5
-.private_extern	_abi_test_clobber_xmm5
-.align	4
-_abi_test_clobber_xmm5:
-L_abi_test_clobber_xmm5_begin:
-	pxor	%xmm5,%xmm5
-	ret
-.globl	_abi_test_clobber_xmm6
-.private_extern	_abi_test_clobber_xmm6
-.align	4
-_abi_test_clobber_xmm6:
-L_abi_test_clobber_xmm6_begin:
-	pxor	%xmm6,%xmm6
-	ret
-.globl	_abi_test_clobber_xmm7
-.private_extern	_abi_test_clobber_xmm7
-.align	4
-_abi_test_clobber_xmm7:
-L_abi_test_clobber_xmm7_begin:
-	pxor	%xmm7,%xmm7
-	ret
-#endif  // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S b/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S
deleted file mode 100644
index 2c46926..0000000
--- a/apple-x86_64/crypto/chacha/chacha-x86_64-apple.S
+++ /dev/null
@@ -1,1621 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-.section	__DATA,__const
-.p2align	6
-L$zero:
-.long	0,0,0,0
-L$one:
-.long	1,0,0,0
-L$inc:
-.long	0,1,2,3
-L$four:
-.long	4,4,4,4
-L$incy:
-.long	0,2,4,6,1,3,5,7
-L$eight:
-.long	8,8,8,8,8,8,8,8
-L$rot16:
-.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
-L$rot24:
-.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
-L$sigma:
-.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
-.p2align	6
-L$zeroz:
-.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
-L$fourz:
-.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
-L$incz:
-.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-L$sixteen:
-.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
-.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text	
-.globl	_ChaCha20_ctr32
-.private_extern _ChaCha20_ctr32
-
-.p2align	6
-_ChaCha20_ctr32:
-
-_CET_ENDBR
-	cmpq	$0,%rdx
-	je	L$no_data
-	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
-	testl	$512,%r10d
-	jnz	L$ChaCha20_ssse3
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$64+24,%rsp
-
-L$ctr32_body:
-
-
-	movdqu	(%rcx),%xmm1
-	movdqu	16(%rcx),%xmm2
-	movdqu	(%r8),%xmm3
-	movdqa	L$one(%rip),%xmm4
-
-
-	movdqa	%xmm1,16(%rsp)
-	movdqa	%xmm2,32(%rsp)
-	movdqa	%xmm3,48(%rsp)
-	movq	%rdx,%rbp
-	jmp	L$oop_outer
-
-.p2align	5
-L$oop_outer:
-	movl	$0x61707865,%eax
-	movl	$0x3320646e,%ebx
-	movl	$0x79622d32,%ecx
-	movl	$0x6b206574,%edx
-	movl	16(%rsp),%r8d
-	movl	20(%rsp),%r9d
-	movl	24(%rsp),%r10d
-	movl	28(%rsp),%r11d
-	movd	%xmm3,%r12d
-	movl	52(%rsp),%r13d
-	movl	56(%rsp),%r14d
-	movl	60(%rsp),%r15d
-
-	movq	%rbp,64+0(%rsp)
-	movl	$10,%ebp
-	movq	%rsi,64+8(%rsp)
-.byte	102,72,15,126,214
-	movq	%rdi,64+16(%rsp)
-	movq	%rsi,%rdi
-	shrq	$32,%rdi
-	jmp	L$oop
-
-.p2align	5
-L$oop:
-	addl	%r8d,%eax
-	xorl	%eax,%r12d
-	roll	$16,%r12d
-	addl	%r9d,%ebx
-	xorl	%ebx,%r13d
-	roll	$16,%r13d
-	addl	%r12d,%esi
-	xorl	%esi,%r8d
-	roll	$12,%r8d
-	addl	%r13d,%edi
-	xorl	%edi,%r9d
-	roll	$12,%r9d
-	addl	%r8d,%eax
-	xorl	%eax,%r12d
-	roll	$8,%r12d
-	addl	%r9d,%ebx
-	xorl	%ebx,%r13d
-	roll	$8,%r13d
-	addl	%r12d,%esi
-	xorl	%esi,%r8d
-	roll	$7,%r8d
-	addl	%r13d,%edi
-	xorl	%edi,%r9d
-	roll	$7,%r9d
-	movl	%esi,32(%rsp)
-	movl	%edi,36(%rsp)
-	movl	40(%rsp),%esi
-	movl	44(%rsp),%edi
-	addl	%r10d,%ecx
-	xorl	%ecx,%r14d
-	roll	$16,%r14d
-	addl	%r11d,%edx
-	xorl	%edx,%r15d
-	roll	$16,%r15d
-	addl	%r14d,%esi
-	xorl	%esi,%r10d
-	roll	$12,%r10d
-	addl	%r15d,%edi
-	xorl	%edi,%r11d
-	roll	$12,%r11d
-	addl	%r10d,%ecx
-	xorl	%ecx,%r14d
-	roll	$8,%r14d
-	addl	%r11d,%edx
-	xorl	%edx,%r15d
-	roll	$8,%r15d
-	addl	%r14d,%esi
-	xorl	%esi,%r10d
-	roll	$7,%r10d
-	addl	%r15d,%edi
-	xorl	%edi,%r11d
-	roll	$7,%r11d
-	addl	%r9d,%eax
-	xorl	%eax,%r15d
-	roll	$16,%r15d
-	addl	%r10d,%ebx
-	xorl	%ebx,%r12d
-	roll	$16,%r12d
-	addl	%r15d,%esi
-	xorl	%esi,%r9d
-	roll	$12,%r9d
-	addl	%r12d,%edi
-	xorl	%edi,%r10d
-	roll	$12,%r10d
-	addl	%r9d,%eax
-	xorl	%eax,%r15d
-	roll	$8,%r15d
-	addl	%r10d,%ebx
-	xorl	%ebx,%r12d
-	roll	$8,%r12d
-	addl	%r15d,%esi
-	xorl	%esi,%r9d
-	roll	$7,%r9d
-	addl	%r12d,%edi
-	xorl	%edi,%r10d
-	roll	$7,%r10d
-	movl	%esi,40(%rsp)
-	movl	%edi,44(%rsp)
-	movl	32(%rsp),%esi
-	movl	36(%rsp),%edi
-	addl	%r11d,%ecx
-	xorl	%ecx,%r13d
-	roll	$16,%r13d
-	addl	%r8d,%edx
-	xorl	%edx,%r14d
-	roll	$16,%r14d
-	addl	%r13d,%esi
-	xorl	%esi,%r11d
-	roll	$12,%r11d
-	addl	%r14d,%edi
-	xorl	%edi,%r8d
-	roll	$12,%r8d
-	addl	%r11d,%ecx
-	xorl	%ecx,%r13d
-	roll	$8,%r13d
-	addl	%r8d,%edx
-	xorl	%edx,%r14d
-	roll	$8,%r14d
-	addl	%r13d,%esi
-	xorl	%esi,%r11d
-	roll	$7,%r11d
-	addl	%r14d,%edi
-	xorl	%edi,%r8d
-	roll	$7,%r8d
-	decl	%ebp
-	jnz	L$oop
-	movl	%edi,36(%rsp)
-	movl	%esi,32(%rsp)
-	movq	64(%rsp),%rbp
-	movdqa	%xmm2,%xmm1
-	movq	64+8(%rsp),%rsi
-	paddd	%xmm4,%xmm3
-	movq	64+16(%rsp),%rdi
-
-	addl	$0x61707865,%eax
-	addl	$0x3320646e,%ebx
-	addl	$0x79622d32,%ecx
-	addl	$0x6b206574,%edx
-	addl	16(%rsp),%r8d
-	addl	20(%rsp),%r9d
-	addl	24(%rsp),%r10d
-	addl	28(%rsp),%r11d
-	addl	48(%rsp),%r12d
-	addl	52(%rsp),%r13d
-	addl	56(%rsp),%r14d
-	addl	60(%rsp),%r15d
-	paddd	32(%rsp),%xmm1
-
-	cmpq	$64,%rbp
-	jb	L$tail
-
-	xorl	0(%rsi),%eax
-	xorl	4(%rsi),%ebx
-	xorl	8(%rsi),%ecx
-	xorl	12(%rsi),%edx
-	xorl	16(%rsi),%r8d
-	xorl	20(%rsi),%r9d
-	xorl	24(%rsi),%r10d
-	xorl	28(%rsi),%r11d
-	movdqu	32(%rsi),%xmm0
-	xorl	48(%rsi),%r12d
-	xorl	52(%rsi),%r13d
-	xorl	56(%rsi),%r14d
-	xorl	60(%rsi),%r15d
-	leaq	64(%rsi),%rsi
-	pxor	%xmm1,%xmm0
-
-	movdqa	%xmm2,32(%rsp)
-	movd	%xmm3,48(%rsp)
-
-	movl	%eax,0(%rdi)
-	movl	%ebx,4(%rdi)
-	movl	%ecx,8(%rdi)
-	movl	%edx,12(%rdi)
-	movl	%r8d,16(%rdi)
-	movl	%r9d,20(%rdi)
-	movl	%r10d,24(%rdi)
-	movl	%r11d,28(%rdi)
-	movdqu	%xmm0,32(%rdi)
-	movl	%r12d,48(%rdi)
-	movl	%r13d,52(%rdi)
-	movl	%r14d,56(%rdi)
-	movl	%r15d,60(%rdi)
-	leaq	64(%rdi),%rdi
-
-	subq	$64,%rbp
-	jnz	L$oop_outer
-
-	jmp	L$done
-
-.p2align	4
-L$tail:
-	movl	%eax,0(%rsp)
-	movl	%ebx,4(%rsp)
-	xorq	%rbx,%rbx
-	movl	%ecx,8(%rsp)
-	movl	%edx,12(%rsp)
-	movl	%r8d,16(%rsp)
-	movl	%r9d,20(%rsp)
-	movl	%r10d,24(%rsp)
-	movl	%r11d,28(%rsp)
-	movdqa	%xmm1,32(%rsp)
-	movl	%r12d,48(%rsp)
-	movl	%r13d,52(%rsp)
-	movl	%r14d,56(%rsp)
-	movl	%r15d,60(%rsp)
-
-L$oop_tail:
-	movzbl	(%rsi,%rbx,1),%eax
-	movzbl	(%rsp,%rbx,1),%edx
-	leaq	1(%rbx),%rbx
-	xorl	%edx,%eax
-	movb	%al,-1(%rdi,%rbx,1)
-	decq	%rbp
-	jnz	L$oop_tail
-
-L$done:
-	leaq	64+24+48(%rsp),%rsi
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$no_data:
-	ret
-
-
-
-.p2align	5
-ChaCha20_ssse3:
-L$ChaCha20_ssse3:
-
-	movq	%rsp,%r9
-
-	cmpq	$128,%rdx
-	ja	L$ChaCha20_4x
-
-L$do_sse3_after_all:
-	subq	$64+8,%rsp
-	movdqa	L$sigma(%rip),%xmm0
-	movdqu	(%rcx),%xmm1
-	movdqu	16(%rcx),%xmm2
-	movdqu	(%r8),%xmm3
-	movdqa	L$rot16(%rip),%xmm6
-	movdqa	L$rot24(%rip),%xmm7
-
-	movdqa	%xmm0,0(%rsp)
-	movdqa	%xmm1,16(%rsp)
-	movdqa	%xmm2,32(%rsp)
-	movdqa	%xmm3,48(%rsp)
-	movq	$10,%r8
-	jmp	L$oop_ssse3
-
-.p2align	5
-L$oop_outer_ssse3:
-	movdqa	L$one(%rip),%xmm3
-	movdqa	0(%rsp),%xmm0
-	movdqa	16(%rsp),%xmm1
-	movdqa	32(%rsp),%xmm2
-	paddd	48(%rsp),%xmm3
-	movq	$10,%r8
-	movdqa	%xmm3,48(%rsp)
-	jmp	L$oop_ssse3
-
-.p2align	5
-L$oop_ssse3:
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$20,%xmm1
-	pslld	$12,%xmm4
-	por	%xmm4,%xmm1
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$25,%xmm1
-	pslld	$7,%xmm4
-	por	%xmm4,%xmm1
-	pshufd	$78,%xmm2,%xmm2
-	pshufd	$57,%xmm1,%xmm1
-	pshufd	$147,%xmm3,%xmm3
-	nop
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,222
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$20,%xmm1
-	pslld	$12,%xmm4
-	por	%xmm4,%xmm1
-	paddd	%xmm1,%xmm0
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,223
-	paddd	%xmm3,%xmm2
-	pxor	%xmm2,%xmm1
-	movdqa	%xmm1,%xmm4
-	psrld	$25,%xmm1
-	pslld	$7,%xmm4
-	por	%xmm4,%xmm1
-	pshufd	$78,%xmm2,%xmm2
-	pshufd	$147,%xmm1,%xmm1
-	pshufd	$57,%xmm3,%xmm3
-	decq	%r8
-	jnz	L$oop_ssse3
-	paddd	0(%rsp),%xmm0
-	paddd	16(%rsp),%xmm1
-	paddd	32(%rsp),%xmm2
-	paddd	48(%rsp),%xmm3
-
-	cmpq	$64,%rdx
-	jb	L$tail_ssse3
-
-	movdqu	0(%rsi),%xmm4
-	movdqu	16(%rsi),%xmm5
-	pxor	%xmm4,%xmm0
-	movdqu	32(%rsi),%xmm4
-	pxor	%xmm5,%xmm1
-	movdqu	48(%rsi),%xmm5
-	leaq	64(%rsi),%rsi
-	pxor	%xmm4,%xmm2
-	pxor	%xmm5,%xmm3
-
-	movdqu	%xmm0,0(%rdi)
-	movdqu	%xmm1,16(%rdi)
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm3,48(%rdi)
-	leaq	64(%rdi),%rdi
-
-	subq	$64,%rdx
-	jnz	L$oop_outer_ssse3
-
-	jmp	L$done_ssse3
-
-.p2align	4
-L$tail_ssse3:
-	movdqa	%xmm0,0(%rsp)
-	movdqa	%xmm1,16(%rsp)
-	movdqa	%xmm2,32(%rsp)
-	movdqa	%xmm3,48(%rsp)
-	xorq	%r8,%r8
-
-L$oop_tail_ssse3:
-	movzbl	(%rsi,%r8,1),%eax
-	movzbl	(%rsp,%r8,1),%ecx
-	leaq	1(%r8),%r8
-	xorl	%ecx,%eax
-	movb	%al,-1(%rdi,%r8,1)
-	decq	%rdx
-	jnz	L$oop_tail_ssse3
-
-L$done_ssse3:
-	leaq	(%r9),%rsp
-
-L$ssse3_epilogue:
-	ret
-
-
-
-.p2align	5
-ChaCha20_4x:
-L$ChaCha20_4x:
-
-	movq	%rsp,%r9
-
-	movq	%r10,%r11
-	shrq	$32,%r10
-	testq	$32,%r10
-	jnz	L$ChaCha20_8x
-	cmpq	$192,%rdx
-	ja	L$proceed4x
-
-	andq	$71303168,%r11
-	cmpq	$4194304,%r11
-	je	L$do_sse3_after_all
-
-L$proceed4x:
-	subq	$0x140+8,%rsp
-	movdqa	L$sigma(%rip),%xmm11
-	movdqu	(%rcx),%xmm15
-	movdqu	16(%rcx),%xmm7
-	movdqu	(%r8),%xmm3
-	leaq	256(%rsp),%rcx
-	leaq	L$rot16(%rip),%r10
-	leaq	L$rot24(%rip),%r11
-
-	pshufd	$0x00,%xmm11,%xmm8
-	pshufd	$0x55,%xmm11,%xmm9
-	movdqa	%xmm8,64(%rsp)
-	pshufd	$0xaa,%xmm11,%xmm10
-	movdqa	%xmm9,80(%rsp)
-	pshufd	$0xff,%xmm11,%xmm11
-	movdqa	%xmm10,96(%rsp)
-	movdqa	%xmm11,112(%rsp)
-
-	pshufd	$0x00,%xmm15,%xmm12
-	pshufd	$0x55,%xmm15,%xmm13
-	movdqa	%xmm12,128-256(%rcx)
-	pshufd	$0xaa,%xmm15,%xmm14
-	movdqa	%xmm13,144-256(%rcx)
-	pshufd	$0xff,%xmm15,%xmm15
-	movdqa	%xmm14,160-256(%rcx)
-	movdqa	%xmm15,176-256(%rcx)
-
-	pshufd	$0x00,%xmm7,%xmm4
-	pshufd	$0x55,%xmm7,%xmm5
-	movdqa	%xmm4,192-256(%rcx)
-	pshufd	$0xaa,%xmm7,%xmm6
-	movdqa	%xmm5,208-256(%rcx)
-	pshufd	$0xff,%xmm7,%xmm7
-	movdqa	%xmm6,224-256(%rcx)
-	movdqa	%xmm7,240-256(%rcx)
-
-	pshufd	$0x00,%xmm3,%xmm0
-	pshufd	$0x55,%xmm3,%xmm1
-	paddd	L$inc(%rip),%xmm0
-	pshufd	$0xaa,%xmm3,%xmm2
-	movdqa	%xmm1,272-256(%rcx)
-	pshufd	$0xff,%xmm3,%xmm3
-	movdqa	%xmm2,288-256(%rcx)
-	movdqa	%xmm3,304-256(%rcx)
-
-	jmp	L$oop_enter4x
-
-.p2align	5
-L$oop_outer4x:
-	movdqa	64(%rsp),%xmm8
-	movdqa	80(%rsp),%xmm9
-	movdqa	96(%rsp),%xmm10
-	movdqa	112(%rsp),%xmm11
-	movdqa	128-256(%rcx),%xmm12
-	movdqa	144-256(%rcx),%xmm13
-	movdqa	160-256(%rcx),%xmm14
-	movdqa	176-256(%rcx),%xmm15
-	movdqa	192-256(%rcx),%xmm4
-	movdqa	208-256(%rcx),%xmm5
-	movdqa	224-256(%rcx),%xmm6
-	movdqa	240-256(%rcx),%xmm7
-	movdqa	256-256(%rcx),%xmm0
-	movdqa	272-256(%rcx),%xmm1
-	movdqa	288-256(%rcx),%xmm2
-	movdqa	304-256(%rcx),%xmm3
-	paddd	L$four(%rip),%xmm0
-
-L$oop_enter4x:
-	movdqa	%xmm6,32(%rsp)
-	movdqa	%xmm7,48(%rsp)
-	movdqa	(%r10),%xmm7
-	movl	$10,%eax
-	movdqa	%xmm0,256-256(%rcx)
-	jmp	L$oop4x
-
-.p2align	5
-L$oop4x:
-	paddd	%xmm12,%xmm8
-	paddd	%xmm13,%xmm9
-	pxor	%xmm8,%xmm0
-	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,199
-.byte	102,15,56,0,207
-	paddd	%xmm0,%xmm4
-	paddd	%xmm1,%xmm5
-	pxor	%xmm4,%xmm12
-	pxor	%xmm5,%xmm13
-	movdqa	%xmm12,%xmm6
-	pslld	$12,%xmm12
-	psrld	$20,%xmm6
-	movdqa	%xmm13,%xmm7
-	pslld	$12,%xmm13
-	por	%xmm6,%xmm12
-	psrld	$20,%xmm7
-	movdqa	(%r11),%xmm6
-	por	%xmm7,%xmm13
-	paddd	%xmm12,%xmm8
-	paddd	%xmm13,%xmm9
-	pxor	%xmm8,%xmm0
-	pxor	%xmm9,%xmm1
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-	paddd	%xmm0,%xmm4
-	paddd	%xmm1,%xmm5
-	pxor	%xmm4,%xmm12
-	pxor	%xmm5,%xmm13
-	movdqa	%xmm12,%xmm7
-	pslld	$7,%xmm12
-	psrld	$25,%xmm7
-	movdqa	%xmm13,%xmm6
-	pslld	$7,%xmm13
-	por	%xmm7,%xmm12
-	psrld	$25,%xmm6
-	movdqa	(%r10),%xmm7
-	por	%xmm6,%xmm13
-	movdqa	%xmm4,0(%rsp)
-	movdqa	%xmm5,16(%rsp)
-	movdqa	32(%rsp),%xmm4
-	movdqa	48(%rsp),%xmm5
-	paddd	%xmm14,%xmm10
-	paddd	%xmm15,%xmm11
-	pxor	%xmm10,%xmm2
-	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,215
-.byte	102,15,56,0,223
-	paddd	%xmm2,%xmm4
-	paddd	%xmm3,%xmm5
-	pxor	%xmm4,%xmm14
-	pxor	%xmm5,%xmm15
-	movdqa	%xmm14,%xmm6
-	pslld	$12,%xmm14
-	psrld	$20,%xmm6
-	movdqa	%xmm15,%xmm7
-	pslld	$12,%xmm15
-	por	%xmm6,%xmm14
-	psrld	$20,%xmm7
-	movdqa	(%r11),%xmm6
-	por	%xmm7,%xmm15
-	paddd	%xmm14,%xmm10
-	paddd	%xmm15,%xmm11
-	pxor	%xmm10,%xmm2
-	pxor	%xmm11,%xmm3
-.byte	102,15,56,0,214
-.byte	102,15,56,0,222
-	paddd	%xmm2,%xmm4
-	paddd	%xmm3,%xmm5
-	pxor	%xmm4,%xmm14
-	pxor	%xmm5,%xmm15
-	movdqa	%xmm14,%xmm7
-	pslld	$7,%xmm14
-	psrld	$25,%xmm7
-	movdqa	%xmm15,%xmm6
-	pslld	$7,%xmm15
-	por	%xmm7,%xmm14
-	psrld	$25,%xmm6
-	movdqa	(%r10),%xmm7
-	por	%xmm6,%xmm15
-	paddd	%xmm13,%xmm8
-	paddd	%xmm14,%xmm9
-	pxor	%xmm8,%xmm3
-	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,223
-.byte	102,15,56,0,199
-	paddd	%xmm3,%xmm4
-	paddd	%xmm0,%xmm5
-	pxor	%xmm4,%xmm13
-	pxor	%xmm5,%xmm14
-	movdqa	%xmm13,%xmm6
-	pslld	$12,%xmm13
-	psrld	$20,%xmm6
-	movdqa	%xmm14,%xmm7
-	pslld	$12,%xmm14
-	por	%xmm6,%xmm13
-	psrld	$20,%xmm7
-	movdqa	(%r11),%xmm6
-	por	%xmm7,%xmm14
-	paddd	%xmm13,%xmm8
-	paddd	%xmm14,%xmm9
-	pxor	%xmm8,%xmm3
-	pxor	%xmm9,%xmm0
-.byte	102,15,56,0,222
-.byte	102,15,56,0,198
-	paddd	%xmm3,%xmm4
-	paddd	%xmm0,%xmm5
-	pxor	%xmm4,%xmm13
-	pxor	%xmm5,%xmm14
-	movdqa	%xmm13,%xmm7
-	pslld	$7,%xmm13
-	psrld	$25,%xmm7
-	movdqa	%xmm14,%xmm6
-	pslld	$7,%xmm14
-	por	%xmm7,%xmm13
-	psrld	$25,%xmm6
-	movdqa	(%r10),%xmm7
-	por	%xmm6,%xmm14
-	movdqa	%xmm4,32(%rsp)
-	movdqa	%xmm5,48(%rsp)
-	movdqa	0(%rsp),%xmm4
-	movdqa	16(%rsp),%xmm5
-	paddd	%xmm15,%xmm10
-	paddd	%xmm12,%xmm11
-	pxor	%xmm10,%xmm1
-	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,207
-.byte	102,15,56,0,215
-	paddd	%xmm1,%xmm4
-	paddd	%xmm2,%xmm5
-	pxor	%xmm4,%xmm15
-	pxor	%xmm5,%xmm12
-	movdqa	%xmm15,%xmm6
-	pslld	$12,%xmm15
-	psrld	$20,%xmm6
-	movdqa	%xmm12,%xmm7
-	pslld	$12,%xmm12
-	por	%xmm6,%xmm15
-	psrld	$20,%xmm7
-	movdqa	(%r11),%xmm6
-	por	%xmm7,%xmm12
-	paddd	%xmm15,%xmm10
-	paddd	%xmm12,%xmm11
-	pxor	%xmm10,%xmm1
-	pxor	%xmm11,%xmm2
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
-	paddd	%xmm1,%xmm4
-	paddd	%xmm2,%xmm5
-	pxor	%xmm4,%xmm15
-	pxor	%xmm5,%xmm12
-	movdqa	%xmm15,%xmm7
-	pslld	$7,%xmm15
-	psrld	$25,%xmm7
-	movdqa	%xmm12,%xmm6
-	pslld	$7,%xmm12
-	por	%xmm7,%xmm15
-	psrld	$25,%xmm6
-	movdqa	(%r10),%xmm7
-	por	%xmm6,%xmm12
-	decl	%eax
-	jnz	L$oop4x
-
-	paddd	64(%rsp),%xmm8
-	paddd	80(%rsp),%xmm9
-	paddd	96(%rsp),%xmm10
-	paddd	112(%rsp),%xmm11
-
-	movdqa	%xmm8,%xmm6
-	punpckldq	%xmm9,%xmm8
-	movdqa	%xmm10,%xmm7
-	punpckldq	%xmm11,%xmm10
-	punpckhdq	%xmm9,%xmm6
-	punpckhdq	%xmm11,%xmm7
-	movdqa	%xmm8,%xmm9
-	punpcklqdq	%xmm10,%xmm8
-	movdqa	%xmm6,%xmm11
-	punpcklqdq	%xmm7,%xmm6
-	punpckhqdq	%xmm10,%xmm9
-	punpckhqdq	%xmm7,%xmm11
-	paddd	128-256(%rcx),%xmm12
-	paddd	144-256(%rcx),%xmm13
-	paddd	160-256(%rcx),%xmm14
-	paddd	176-256(%rcx),%xmm15
-
-	movdqa	%xmm8,0(%rsp)
-	movdqa	%xmm9,16(%rsp)
-	movdqa	32(%rsp),%xmm8
-	movdqa	48(%rsp),%xmm9
-
-	movdqa	%xmm12,%xmm10
-	punpckldq	%xmm13,%xmm12
-	movdqa	%xmm14,%xmm7
-	punpckldq	%xmm15,%xmm14
-	punpckhdq	%xmm13,%xmm10
-	punpckhdq	%xmm15,%xmm7
-	movdqa	%xmm12,%xmm13
-	punpcklqdq	%xmm14,%xmm12
-	movdqa	%xmm10,%xmm15
-	punpcklqdq	%xmm7,%xmm10
-	punpckhqdq	%xmm14,%xmm13
-	punpckhqdq	%xmm7,%xmm15
-	paddd	192-256(%rcx),%xmm4
-	paddd	208-256(%rcx),%xmm5
-	paddd	224-256(%rcx),%xmm8
-	paddd	240-256(%rcx),%xmm9
-
-	movdqa	%xmm6,32(%rsp)
-	movdqa	%xmm11,48(%rsp)
-
-	movdqa	%xmm4,%xmm14
-	punpckldq	%xmm5,%xmm4
-	movdqa	%xmm8,%xmm7
-	punpckldq	%xmm9,%xmm8
-	punpckhdq	%xmm5,%xmm14
-	punpckhdq	%xmm9,%xmm7
-	movdqa	%xmm4,%xmm5
-	punpcklqdq	%xmm8,%xmm4
-	movdqa	%xmm14,%xmm9
-	punpcklqdq	%xmm7,%xmm14
-	punpckhqdq	%xmm8,%xmm5
-	punpckhqdq	%xmm7,%xmm9
-	paddd	256-256(%rcx),%xmm0
-	paddd	272-256(%rcx),%xmm1
-	paddd	288-256(%rcx),%xmm2
-	paddd	304-256(%rcx),%xmm3
-
-	movdqa	%xmm0,%xmm8
-	punpckldq	%xmm1,%xmm0
-	movdqa	%xmm2,%xmm7
-	punpckldq	%xmm3,%xmm2
-	punpckhdq	%xmm1,%xmm8
-	punpckhdq	%xmm3,%xmm7
-	movdqa	%xmm0,%xmm1
-	punpcklqdq	%xmm2,%xmm0
-	movdqa	%xmm8,%xmm3
-	punpcklqdq	%xmm7,%xmm8
-	punpckhqdq	%xmm2,%xmm1
-	punpckhqdq	%xmm7,%xmm3
-	cmpq	$256,%rdx
-	jb	L$tail4x
-
-	movdqu	0(%rsi),%xmm6
-	movdqu	16(%rsi),%xmm11
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm7
-	pxor	0(%rsp),%xmm6
-	pxor	%xmm12,%xmm11
-	pxor	%xmm4,%xmm2
-	pxor	%xmm0,%xmm7
-
-	movdqu	%xmm6,0(%rdi)
-	movdqu	64(%rsi),%xmm6
-	movdqu	%xmm11,16(%rdi)
-	movdqu	80(%rsi),%xmm11
-	movdqu	%xmm2,32(%rdi)
-	movdqu	96(%rsi),%xmm2
-	movdqu	%xmm7,48(%rdi)
-	movdqu	112(%rsi),%xmm7
-	leaq	128(%rsi),%rsi
-	pxor	16(%rsp),%xmm6
-	pxor	%xmm13,%xmm11
-	pxor	%xmm5,%xmm2
-	pxor	%xmm1,%xmm7
-
-	movdqu	%xmm6,64(%rdi)
-	movdqu	0(%rsi),%xmm6
-	movdqu	%xmm11,80(%rdi)
-	movdqu	16(%rsi),%xmm11
-	movdqu	%xmm2,96(%rdi)
-	movdqu	32(%rsi),%xmm2
-	movdqu	%xmm7,112(%rdi)
-	leaq	128(%rdi),%rdi
-	movdqu	48(%rsi),%xmm7
-	pxor	32(%rsp),%xmm6
-	pxor	%xmm10,%xmm11
-	pxor	%xmm14,%xmm2
-	pxor	%xmm8,%xmm7
-
-	movdqu	%xmm6,0(%rdi)
-	movdqu	64(%rsi),%xmm6
-	movdqu	%xmm11,16(%rdi)
-	movdqu	80(%rsi),%xmm11
-	movdqu	%xmm2,32(%rdi)
-	movdqu	96(%rsi),%xmm2
-	movdqu	%xmm7,48(%rdi)
-	movdqu	112(%rsi),%xmm7
-	leaq	128(%rsi),%rsi
-	pxor	48(%rsp),%xmm6
-	pxor	%xmm15,%xmm11
-	pxor	%xmm9,%xmm2
-	pxor	%xmm3,%xmm7
-	movdqu	%xmm6,64(%rdi)
-	movdqu	%xmm11,80(%rdi)
-	movdqu	%xmm2,96(%rdi)
-	movdqu	%xmm7,112(%rdi)
-	leaq	128(%rdi),%rdi
-
-	subq	$256,%rdx
-	jnz	L$oop_outer4x
-
-	jmp	L$done4x
-
-L$tail4x:
-	cmpq	$192,%rdx
-	jae	L$192_or_more4x
-	cmpq	$128,%rdx
-	jae	L$128_or_more4x
-	cmpq	$64,%rdx
-	jae	L$64_or_more4x
-
-
-	xorq	%r10,%r10
-
-	movdqa	%xmm12,16(%rsp)
-	movdqa	%xmm4,32(%rsp)
-	movdqa	%xmm0,48(%rsp)
-	jmp	L$oop_tail4x
-
-.p2align	5
-L$64_or_more4x:
-	movdqu	0(%rsi),%xmm6
-	movdqu	16(%rsi),%xmm11
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm7
-	pxor	0(%rsp),%xmm6
-	pxor	%xmm12,%xmm11
-	pxor	%xmm4,%xmm2
-	pxor	%xmm0,%xmm7
-	movdqu	%xmm6,0(%rdi)
-	movdqu	%xmm11,16(%rdi)
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm7,48(%rdi)
-	je	L$done4x
-
-	movdqa	16(%rsp),%xmm6
-	leaq	64(%rsi),%rsi
-	xorq	%r10,%r10
-	movdqa	%xmm6,0(%rsp)
-	movdqa	%xmm13,16(%rsp)
-	leaq	64(%rdi),%rdi
-	movdqa	%xmm5,32(%rsp)
-	subq	$64,%rdx
-	movdqa	%xmm1,48(%rsp)
-	jmp	L$oop_tail4x
-
-.p2align	5
-L$128_or_more4x:
-	movdqu	0(%rsi),%xmm6
-	movdqu	16(%rsi),%xmm11
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm7
-	pxor	0(%rsp),%xmm6
-	pxor	%xmm12,%xmm11
-	pxor	%xmm4,%xmm2
-	pxor	%xmm0,%xmm7
-
-	movdqu	%xmm6,0(%rdi)
-	movdqu	64(%rsi),%xmm6
-	movdqu	%xmm11,16(%rdi)
-	movdqu	80(%rsi),%xmm11
-	movdqu	%xmm2,32(%rdi)
-	movdqu	96(%rsi),%xmm2
-	movdqu	%xmm7,48(%rdi)
-	movdqu	112(%rsi),%xmm7
-	pxor	16(%rsp),%xmm6
-	pxor	%xmm13,%xmm11
-	pxor	%xmm5,%xmm2
-	pxor	%xmm1,%xmm7
-	movdqu	%xmm6,64(%rdi)
-	movdqu	%xmm11,80(%rdi)
-	movdqu	%xmm2,96(%rdi)
-	movdqu	%xmm7,112(%rdi)
-	je	L$done4x
-
-	movdqa	32(%rsp),%xmm6
-	leaq	128(%rsi),%rsi
-	xorq	%r10,%r10
-	movdqa	%xmm6,0(%rsp)
-	movdqa	%xmm10,16(%rsp)
-	leaq	128(%rdi),%rdi
-	movdqa	%xmm14,32(%rsp)
-	subq	$128,%rdx
-	movdqa	%xmm8,48(%rsp)
-	jmp	L$oop_tail4x
-
-.p2align	5
-L$192_or_more4x:
-	movdqu	0(%rsi),%xmm6
-	movdqu	16(%rsi),%xmm11
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm7
-	pxor	0(%rsp),%xmm6
-	pxor	%xmm12,%xmm11
-	pxor	%xmm4,%xmm2
-	pxor	%xmm0,%xmm7
-
-	movdqu	%xmm6,0(%rdi)
-	movdqu	64(%rsi),%xmm6
-	movdqu	%xmm11,16(%rdi)
-	movdqu	80(%rsi),%xmm11
-	movdqu	%xmm2,32(%rdi)
-	movdqu	96(%rsi),%xmm2
-	movdqu	%xmm7,48(%rdi)
-	movdqu	112(%rsi),%xmm7
-	leaq	128(%rsi),%rsi
-	pxor	16(%rsp),%xmm6
-	pxor	%xmm13,%xmm11
-	pxor	%xmm5,%xmm2
-	pxor	%xmm1,%xmm7
-
-	movdqu	%xmm6,64(%rdi)
-	movdqu	0(%rsi),%xmm6
-	movdqu	%xmm11,80(%rdi)
-	movdqu	16(%rsi),%xmm11
-	movdqu	%xmm2,96(%rdi)
-	movdqu	32(%rsi),%xmm2
-	movdqu	%xmm7,112(%rdi)
-	leaq	128(%rdi),%rdi
-	movdqu	48(%rsi),%xmm7
-	pxor	32(%rsp),%xmm6
-	pxor	%xmm10,%xmm11
-	pxor	%xmm14,%xmm2
-	pxor	%xmm8,%xmm7
-	movdqu	%xmm6,0(%rdi)
-	movdqu	%xmm11,16(%rdi)
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm7,48(%rdi)
-	je	L$done4x
-
-	movdqa	48(%rsp),%xmm6
-	leaq	64(%rsi),%rsi
-	xorq	%r10,%r10
-	movdqa	%xmm6,0(%rsp)
-	movdqa	%xmm15,16(%rsp)
-	leaq	64(%rdi),%rdi
-	movdqa	%xmm9,32(%rsp)
-	subq	$192,%rdx
-	movdqa	%xmm3,48(%rsp)
-
-L$oop_tail4x:
-	movzbl	(%rsi,%r10,1),%eax
-	movzbl	(%rsp,%r10,1),%ecx
-	leaq	1(%r10),%r10
-	xorl	%ecx,%eax
-	movb	%al,-1(%rdi,%r10,1)
-	decq	%rdx
-	jnz	L$oop_tail4x
-
-L$done4x:
-	leaq	(%r9),%rsp
-
-L$4x_epilogue:
-	ret
-
-
-
-.p2align	5
-ChaCha20_8x:
-L$ChaCha20_8x:
-
-	movq	%rsp,%r9
-
-	subq	$0x280+8,%rsp
-	andq	$-32,%rsp
-	vzeroupper
-
-
-
-
-
-
-
-
-
-
-	vbroadcasti128	L$sigma(%rip),%ymm11
-	vbroadcasti128	(%rcx),%ymm3
-	vbroadcasti128	16(%rcx),%ymm15
-	vbroadcasti128	(%r8),%ymm7
-	leaq	256(%rsp),%rcx
-	leaq	512(%rsp),%rax
-	leaq	L$rot16(%rip),%r10
-	leaq	L$rot24(%rip),%r11
-
-	vpshufd	$0x00,%ymm11,%ymm8
-	vpshufd	$0x55,%ymm11,%ymm9
-	vmovdqa	%ymm8,128-256(%rcx)
-	vpshufd	$0xaa,%ymm11,%ymm10
-	vmovdqa	%ymm9,160-256(%rcx)
-	vpshufd	$0xff,%ymm11,%ymm11
-	vmovdqa	%ymm10,192-256(%rcx)
-	vmovdqa	%ymm11,224-256(%rcx)
-
-	vpshufd	$0x00,%ymm3,%ymm0
-	vpshufd	$0x55,%ymm3,%ymm1
-	vmovdqa	%ymm0,256-256(%rcx)
-	vpshufd	$0xaa,%ymm3,%ymm2
-	vmovdqa	%ymm1,288-256(%rcx)
-	vpshufd	$0xff,%ymm3,%ymm3
-	vmovdqa	%ymm2,320-256(%rcx)
-	vmovdqa	%ymm3,352-256(%rcx)
-
-	vpshufd	$0x00,%ymm15,%ymm12
-	vpshufd	$0x55,%ymm15,%ymm13
-	vmovdqa	%ymm12,384-512(%rax)
-	vpshufd	$0xaa,%ymm15,%ymm14
-	vmovdqa	%ymm13,416-512(%rax)
-	vpshufd	$0xff,%ymm15,%ymm15
-	vmovdqa	%ymm14,448-512(%rax)
-	vmovdqa	%ymm15,480-512(%rax)
-
-	vpshufd	$0x00,%ymm7,%ymm4
-	vpshufd	$0x55,%ymm7,%ymm5
-	vpaddd	L$incy(%rip),%ymm4,%ymm4
-	vpshufd	$0xaa,%ymm7,%ymm6
-	vmovdqa	%ymm5,544-512(%rax)
-	vpshufd	$0xff,%ymm7,%ymm7
-	vmovdqa	%ymm6,576-512(%rax)
-	vmovdqa	%ymm7,608-512(%rax)
-
-	jmp	L$oop_enter8x
-
-.p2align	5
-L$oop_outer8x:
-	vmovdqa	128-256(%rcx),%ymm8
-	vmovdqa	160-256(%rcx),%ymm9
-	vmovdqa	192-256(%rcx),%ymm10
-	vmovdqa	224-256(%rcx),%ymm11
-	vmovdqa	256-256(%rcx),%ymm0
-	vmovdqa	288-256(%rcx),%ymm1
-	vmovdqa	320-256(%rcx),%ymm2
-	vmovdqa	352-256(%rcx),%ymm3
-	vmovdqa	384-512(%rax),%ymm12
-	vmovdqa	416-512(%rax),%ymm13
-	vmovdqa	448-512(%rax),%ymm14
-	vmovdqa	480-512(%rax),%ymm15
-	vmovdqa	512-512(%rax),%ymm4
-	vmovdqa	544-512(%rax),%ymm5
-	vmovdqa	576-512(%rax),%ymm6
-	vmovdqa	608-512(%rax),%ymm7
-	vpaddd	L$eight(%rip),%ymm4,%ymm4
-
-L$oop_enter8x:
-	vmovdqa	%ymm14,64(%rsp)
-	vmovdqa	%ymm15,96(%rsp)
-	vbroadcasti128	(%r10),%ymm15
-	vmovdqa	%ymm4,512-512(%rax)
-	movl	$10,%eax
-	jmp	L$oop8x
-
-.p2align	5
-L$oop8x:
-	vpaddd	%ymm0,%ymm8,%ymm8
-	vpxor	%ymm4,%ymm8,%ymm4
-	vpshufb	%ymm15,%ymm4,%ymm4
-	vpaddd	%ymm1,%ymm9,%ymm9
-	vpxor	%ymm5,%ymm9,%ymm5
-	vpshufb	%ymm15,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm12,%ymm12
-	vpxor	%ymm0,%ymm12,%ymm0
-	vpslld	$12,%ymm0,%ymm14
-	vpsrld	$20,%ymm0,%ymm0
-	vpor	%ymm0,%ymm14,%ymm0
-	vbroadcasti128	(%r11),%ymm14
-	vpaddd	%ymm5,%ymm13,%ymm13
-	vpxor	%ymm1,%ymm13,%ymm1
-	vpslld	$12,%ymm1,%ymm15
-	vpsrld	$20,%ymm1,%ymm1
-	vpor	%ymm1,%ymm15,%ymm1
-	vpaddd	%ymm0,%ymm8,%ymm8
-	vpxor	%ymm4,%ymm8,%ymm4
-	vpshufb	%ymm14,%ymm4,%ymm4
-	vpaddd	%ymm1,%ymm9,%ymm9
-	vpxor	%ymm5,%ymm9,%ymm5
-	vpshufb	%ymm14,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm12,%ymm12
-	vpxor	%ymm0,%ymm12,%ymm0
-	vpslld	$7,%ymm0,%ymm15
-	vpsrld	$25,%ymm0,%ymm0
-	vpor	%ymm0,%ymm15,%ymm0
-	vbroadcasti128	(%r10),%ymm15
-	vpaddd	%ymm5,%ymm13,%ymm13
-	vpxor	%ymm1,%ymm13,%ymm1
-	vpslld	$7,%ymm1,%ymm14
-	vpsrld	$25,%ymm1,%ymm1
-	vpor	%ymm1,%ymm14,%ymm1
-	vmovdqa	%ymm12,0(%rsp)
-	vmovdqa	%ymm13,32(%rsp)
-	vmovdqa	64(%rsp),%ymm12
-	vmovdqa	96(%rsp),%ymm13
-	vpaddd	%ymm2,%ymm10,%ymm10
-	vpxor	%ymm6,%ymm10,%ymm6
-	vpshufb	%ymm15,%ymm6,%ymm6
-	vpaddd	%ymm3,%ymm11,%ymm11
-	vpxor	%ymm7,%ymm11,%ymm7
-	vpshufb	%ymm15,%ymm7,%ymm7
-	vpaddd	%ymm6,%ymm12,%ymm12
-	vpxor	%ymm2,%ymm12,%ymm2
-	vpslld	$12,%ymm2,%ymm14
-	vpsrld	$20,%ymm2,%ymm2
-	vpor	%ymm2,%ymm14,%ymm2
-	vbroadcasti128	(%r11),%ymm14
-	vpaddd	%ymm7,%ymm13,%ymm13
-	vpxor	%ymm3,%ymm13,%ymm3
-	vpslld	$12,%ymm3,%ymm15
-	vpsrld	$20,%ymm3,%ymm3
-	vpor	%ymm3,%ymm15,%ymm3
-	vpaddd	%ymm2,%ymm10,%ymm10
-	vpxor	%ymm6,%ymm10,%ymm6
-	vpshufb	%ymm14,%ymm6,%ymm6
-	vpaddd	%ymm3,%ymm11,%ymm11
-	vpxor	%ymm7,%ymm11,%ymm7
-	vpshufb	%ymm14,%ymm7,%ymm7
-	vpaddd	%ymm6,%ymm12,%ymm12
-	vpxor	%ymm2,%ymm12,%ymm2
-	vpslld	$7,%ymm2,%ymm15
-	vpsrld	$25,%ymm2,%ymm2
-	vpor	%ymm2,%ymm15,%ymm2
-	vbroadcasti128	(%r10),%ymm15
-	vpaddd	%ymm7,%ymm13,%ymm13
-	vpxor	%ymm3,%ymm13,%ymm3
-	vpslld	$7,%ymm3,%ymm14
-	vpsrld	$25,%ymm3,%ymm3
-	vpor	%ymm3,%ymm14,%ymm3
-	vpaddd	%ymm1,%ymm8,%ymm8
-	vpxor	%ymm7,%ymm8,%ymm7
-	vpshufb	%ymm15,%ymm7,%ymm7
-	vpaddd	%ymm2,%ymm9,%ymm9
-	vpxor	%ymm4,%ymm9,%ymm4
-	vpshufb	%ymm15,%ymm4,%ymm4
-	vpaddd	%ymm7,%ymm12,%ymm12
-	vpxor	%ymm1,%ymm12,%ymm1
-	vpslld	$12,%ymm1,%ymm14
-	vpsrld	$20,%ymm1,%ymm1
-	vpor	%ymm1,%ymm14,%ymm1
-	vbroadcasti128	(%r11),%ymm14
-	vpaddd	%ymm4,%ymm13,%ymm13
-	vpxor	%ymm2,%ymm13,%ymm2
-	vpslld	$12,%ymm2,%ymm15
-	vpsrld	$20,%ymm2,%ymm2
-	vpor	%ymm2,%ymm15,%ymm2
-	vpaddd	%ymm1,%ymm8,%ymm8
-	vpxor	%ymm7,%ymm8,%ymm7
-	vpshufb	%ymm14,%ymm7,%ymm7
-	vpaddd	%ymm2,%ymm9,%ymm9
-	vpxor	%ymm4,%ymm9,%ymm4
-	vpshufb	%ymm14,%ymm4,%ymm4
-	vpaddd	%ymm7,%ymm12,%ymm12
-	vpxor	%ymm1,%ymm12,%ymm1
-	vpslld	$7,%ymm1,%ymm15
-	vpsrld	$25,%ymm1,%ymm1
-	vpor	%ymm1,%ymm15,%ymm1
-	vbroadcasti128	(%r10),%ymm15
-	vpaddd	%ymm4,%ymm13,%ymm13
-	vpxor	%ymm2,%ymm13,%ymm2
-	vpslld	$7,%ymm2,%ymm14
-	vpsrld	$25,%ymm2,%ymm2
-	vpor	%ymm2,%ymm14,%ymm2
-	vmovdqa	%ymm12,64(%rsp)
-	vmovdqa	%ymm13,96(%rsp)
-	vmovdqa	0(%rsp),%ymm12
-	vmovdqa	32(%rsp),%ymm13
-	vpaddd	%ymm3,%ymm10,%ymm10
-	vpxor	%ymm5,%ymm10,%ymm5
-	vpshufb	%ymm15,%ymm5,%ymm5
-	vpaddd	%ymm0,%ymm11,%ymm11
-	vpxor	%ymm6,%ymm11,%ymm6
-	vpshufb	%ymm15,%ymm6,%ymm6
-	vpaddd	%ymm5,%ymm12,%ymm12
-	vpxor	%ymm3,%ymm12,%ymm3
-	vpslld	$12,%ymm3,%ymm14
-	vpsrld	$20,%ymm3,%ymm3
-	vpor	%ymm3,%ymm14,%ymm3
-	vbroadcasti128	(%r11),%ymm14
-	vpaddd	%ymm6,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm13,%ymm0
-	vpslld	$12,%ymm0,%ymm15
-	vpsrld	$20,%ymm0,%ymm0
-	vpor	%ymm0,%ymm15,%ymm0
-	vpaddd	%ymm3,%ymm10,%ymm10
-	vpxor	%ymm5,%ymm10,%ymm5
-	vpshufb	%ymm14,%ymm5,%ymm5
-	vpaddd	%ymm0,%ymm11,%ymm11
-	vpxor	%ymm6,%ymm11,%ymm6
-	vpshufb	%ymm14,%ymm6,%ymm6
-	vpaddd	%ymm5,%ymm12,%ymm12
-	vpxor	%ymm3,%ymm12,%ymm3
-	vpslld	$7,%ymm3,%ymm15
-	vpsrld	$25,%ymm3,%ymm3
-	vpor	%ymm3,%ymm15,%ymm3
-	vbroadcasti128	(%r10),%ymm15
-	vpaddd	%ymm6,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm13,%ymm0
-	vpslld	$7,%ymm0,%ymm14
-	vpsrld	$25,%ymm0,%ymm0
-	vpor	%ymm0,%ymm14,%ymm0
-	decl	%eax
-	jnz	L$oop8x
-
-	leaq	512(%rsp),%rax
-	vpaddd	128-256(%rcx),%ymm8,%ymm8
-	vpaddd	160-256(%rcx),%ymm9,%ymm9
-	vpaddd	192-256(%rcx),%ymm10,%ymm10
-	vpaddd	224-256(%rcx),%ymm11,%ymm11
-
-	vpunpckldq	%ymm9,%ymm8,%ymm14
-	vpunpckldq	%ymm11,%ymm10,%ymm15
-	vpunpckhdq	%ymm9,%ymm8,%ymm8
-	vpunpckhdq	%ymm11,%ymm10,%ymm10
-	vpunpcklqdq	%ymm15,%ymm14,%ymm9
-	vpunpckhqdq	%ymm15,%ymm14,%ymm14
-	vpunpcklqdq	%ymm10,%ymm8,%ymm11
-	vpunpckhqdq	%ymm10,%ymm8,%ymm8
-	vpaddd	256-256(%rcx),%ymm0,%ymm0
-	vpaddd	288-256(%rcx),%ymm1,%ymm1
-	vpaddd	320-256(%rcx),%ymm2,%ymm2
-	vpaddd	352-256(%rcx),%ymm3,%ymm3
-
-	vpunpckldq	%ymm1,%ymm0,%ymm10
-	vpunpckldq	%ymm3,%ymm2,%ymm15
-	vpunpckhdq	%ymm1,%ymm0,%ymm0
-	vpunpckhdq	%ymm3,%ymm2,%ymm2
-	vpunpcklqdq	%ymm15,%ymm10,%ymm1
-	vpunpckhqdq	%ymm15,%ymm10,%ymm10
-	vpunpcklqdq	%ymm2,%ymm0,%ymm3
-	vpunpckhqdq	%ymm2,%ymm0,%ymm0
-	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
-	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
-	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
-	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
-	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
-	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
-	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
-	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
-	vmovdqa	%ymm15,0(%rsp)
-	vmovdqa	%ymm9,32(%rsp)
-	vmovdqa	64(%rsp),%ymm15
-	vmovdqa	96(%rsp),%ymm9
-
-	vpaddd	384-512(%rax),%ymm12,%ymm12
-	vpaddd	416-512(%rax),%ymm13,%ymm13
-	vpaddd	448-512(%rax),%ymm15,%ymm15
-	vpaddd	480-512(%rax),%ymm9,%ymm9
-
-	vpunpckldq	%ymm13,%ymm12,%ymm2
-	vpunpckldq	%ymm9,%ymm15,%ymm8
-	vpunpckhdq	%ymm13,%ymm12,%ymm12
-	vpunpckhdq	%ymm9,%ymm15,%ymm15
-	vpunpcklqdq	%ymm8,%ymm2,%ymm13
-	vpunpckhqdq	%ymm8,%ymm2,%ymm2
-	vpunpcklqdq	%ymm15,%ymm12,%ymm9
-	vpunpckhqdq	%ymm15,%ymm12,%ymm12
-	vpaddd	512-512(%rax),%ymm4,%ymm4
-	vpaddd	544-512(%rax),%ymm5,%ymm5
-	vpaddd	576-512(%rax),%ymm6,%ymm6
-	vpaddd	608-512(%rax),%ymm7,%ymm7
-
-	vpunpckldq	%ymm5,%ymm4,%ymm15
-	vpunpckldq	%ymm7,%ymm6,%ymm8
-	vpunpckhdq	%ymm5,%ymm4,%ymm4
-	vpunpckhdq	%ymm7,%ymm6,%ymm6
-	vpunpcklqdq	%ymm8,%ymm15,%ymm5
-	vpunpckhqdq	%ymm8,%ymm15,%ymm15
-	vpunpcklqdq	%ymm6,%ymm4,%ymm7
-	vpunpckhqdq	%ymm6,%ymm4,%ymm4
-	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
-	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
-	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
-	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
-	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
-	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
-	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
-	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
-	vmovdqa	0(%rsp),%ymm6
-	vmovdqa	32(%rsp),%ymm12
-
-	cmpq	$512,%rdx
-	jb	L$tail8x
-
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	leaq	128(%rsi),%rsi
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	leaq	128(%rdi),%rdi
-
-	vpxor	0(%rsi),%ymm12,%ymm12
-	vpxor	32(%rsi),%ymm13,%ymm13
-	vpxor	64(%rsi),%ymm10,%ymm10
-	vpxor	96(%rsi),%ymm15,%ymm15
-	leaq	128(%rsi),%rsi
-	vmovdqu	%ymm12,0(%rdi)
-	vmovdqu	%ymm13,32(%rdi)
-	vmovdqu	%ymm10,64(%rdi)
-	vmovdqu	%ymm15,96(%rdi)
-	leaq	128(%rdi),%rdi
-
-	vpxor	0(%rsi),%ymm14,%ymm14
-	vpxor	32(%rsi),%ymm2,%ymm2
-	vpxor	64(%rsi),%ymm3,%ymm3
-	vpxor	96(%rsi),%ymm7,%ymm7
-	leaq	128(%rsi),%rsi
-	vmovdqu	%ymm14,0(%rdi)
-	vmovdqu	%ymm2,32(%rdi)
-	vmovdqu	%ymm3,64(%rdi)
-	vmovdqu	%ymm7,96(%rdi)
-	leaq	128(%rdi),%rdi
-
-	vpxor	0(%rsi),%ymm11,%ymm11
-	vpxor	32(%rsi),%ymm9,%ymm9
-	vpxor	64(%rsi),%ymm0,%ymm0
-	vpxor	96(%rsi),%ymm4,%ymm4
-	leaq	128(%rsi),%rsi
-	vmovdqu	%ymm11,0(%rdi)
-	vmovdqu	%ymm9,32(%rdi)
-	vmovdqu	%ymm0,64(%rdi)
-	vmovdqu	%ymm4,96(%rdi)
-	leaq	128(%rdi),%rdi
-
-	subq	$512,%rdx
-	jnz	L$oop_outer8x
-
-	jmp	L$done8x
-
-L$tail8x:
-	cmpq	$448,%rdx
-	jae	L$448_or_more8x
-	cmpq	$384,%rdx
-	jae	L$384_or_more8x
-	cmpq	$320,%rdx
-	jae	L$320_or_more8x
-	cmpq	$256,%rdx
-	jae	L$256_or_more8x
-	cmpq	$192,%rdx
-	jae	L$192_or_more8x
-	cmpq	$128,%rdx
-	jae	L$128_or_more8x
-	cmpq	$64,%rdx
-	jae	L$64_or_more8x
-
-	xorq	%r10,%r10
-	vmovdqa	%ymm6,0(%rsp)
-	vmovdqa	%ymm8,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$64_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	je	L$done8x
-
-	leaq	64(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm1,0(%rsp)
-	leaq	64(%rdi),%rdi
-	subq	$64,%rdx
-	vmovdqa	%ymm5,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$128_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	je	L$done8x
-
-	leaq	128(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm12,0(%rsp)
-	leaq	128(%rdi),%rdi
-	subq	$128,%rdx
-	vmovdqa	%ymm13,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$192_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vpxor	128(%rsi),%ymm12,%ymm12
-	vpxor	160(%rsi),%ymm13,%ymm13
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	vmovdqu	%ymm12,128(%rdi)
-	vmovdqu	%ymm13,160(%rdi)
-	je	L$done8x
-
-	leaq	192(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm10,0(%rsp)
-	leaq	192(%rdi),%rdi
-	subq	$192,%rdx
-	vmovdqa	%ymm15,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$256_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vpxor	128(%rsi),%ymm12,%ymm12
-	vpxor	160(%rsi),%ymm13,%ymm13
-	vpxor	192(%rsi),%ymm10,%ymm10
-	vpxor	224(%rsi),%ymm15,%ymm15
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	vmovdqu	%ymm12,128(%rdi)
-	vmovdqu	%ymm13,160(%rdi)
-	vmovdqu	%ymm10,192(%rdi)
-	vmovdqu	%ymm15,224(%rdi)
-	je	L$done8x
-
-	leaq	256(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm14,0(%rsp)
-	leaq	256(%rdi),%rdi
-	subq	$256,%rdx
-	vmovdqa	%ymm2,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$320_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vpxor	128(%rsi),%ymm12,%ymm12
-	vpxor	160(%rsi),%ymm13,%ymm13
-	vpxor	192(%rsi),%ymm10,%ymm10
-	vpxor	224(%rsi),%ymm15,%ymm15
-	vpxor	256(%rsi),%ymm14,%ymm14
-	vpxor	288(%rsi),%ymm2,%ymm2
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	vmovdqu	%ymm12,128(%rdi)
-	vmovdqu	%ymm13,160(%rdi)
-	vmovdqu	%ymm10,192(%rdi)
-	vmovdqu	%ymm15,224(%rdi)
-	vmovdqu	%ymm14,256(%rdi)
-	vmovdqu	%ymm2,288(%rdi)
-	je	L$done8x
-
-	leaq	320(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm3,0(%rsp)
-	leaq	320(%rdi),%rdi
-	subq	$320,%rdx
-	vmovdqa	%ymm7,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$384_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vpxor	128(%rsi),%ymm12,%ymm12
-	vpxor	160(%rsi),%ymm13,%ymm13
-	vpxor	192(%rsi),%ymm10,%ymm10
-	vpxor	224(%rsi),%ymm15,%ymm15
-	vpxor	256(%rsi),%ymm14,%ymm14
-	vpxor	288(%rsi),%ymm2,%ymm2
-	vpxor	320(%rsi),%ymm3,%ymm3
-	vpxor	352(%rsi),%ymm7,%ymm7
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	vmovdqu	%ymm12,128(%rdi)
-	vmovdqu	%ymm13,160(%rdi)
-	vmovdqu	%ymm10,192(%rdi)
-	vmovdqu	%ymm15,224(%rdi)
-	vmovdqu	%ymm14,256(%rdi)
-	vmovdqu	%ymm2,288(%rdi)
-	vmovdqu	%ymm3,320(%rdi)
-	vmovdqu	%ymm7,352(%rdi)
-	je	L$done8x
-
-	leaq	384(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm11,0(%rsp)
-	leaq	384(%rdi),%rdi
-	subq	$384,%rdx
-	vmovdqa	%ymm9,32(%rsp)
-	jmp	L$oop_tail8x
-
-.p2align	5
-L$448_or_more8x:
-	vpxor	0(%rsi),%ymm6,%ymm6
-	vpxor	32(%rsi),%ymm8,%ymm8
-	vpxor	64(%rsi),%ymm1,%ymm1
-	vpxor	96(%rsi),%ymm5,%ymm5
-	vpxor	128(%rsi),%ymm12,%ymm12
-	vpxor	160(%rsi),%ymm13,%ymm13
-	vpxor	192(%rsi),%ymm10,%ymm10
-	vpxor	224(%rsi),%ymm15,%ymm15
-	vpxor	256(%rsi),%ymm14,%ymm14
-	vpxor	288(%rsi),%ymm2,%ymm2
-	vpxor	320(%rsi),%ymm3,%ymm3
-	vpxor	352(%rsi),%ymm7,%ymm7
-	vpxor	384(%rsi),%ymm11,%ymm11
-	vpxor	416(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm6,0(%rdi)
-	vmovdqu	%ymm8,32(%rdi)
-	vmovdqu	%ymm1,64(%rdi)
-	vmovdqu	%ymm5,96(%rdi)
-	vmovdqu	%ymm12,128(%rdi)
-	vmovdqu	%ymm13,160(%rdi)
-	vmovdqu	%ymm10,192(%rdi)
-	vmovdqu	%ymm15,224(%rdi)
-	vmovdqu	%ymm14,256(%rdi)
-	vmovdqu	%ymm2,288(%rdi)
-	vmovdqu	%ymm3,320(%rdi)
-	vmovdqu	%ymm7,352(%rdi)
-	vmovdqu	%ymm11,384(%rdi)
-	vmovdqu	%ymm9,416(%rdi)
-	je	L$done8x
-
-	leaq	448(%rsi),%rsi
-	xorq	%r10,%r10
-	vmovdqa	%ymm0,0(%rsp)
-	leaq	448(%rdi),%rdi
-	subq	$448,%rdx
-	vmovdqa	%ymm4,32(%rsp)
-
-L$oop_tail8x:
-	movzbl	(%rsi,%r10,1),%eax
-	movzbl	(%rsp,%r10,1),%ecx
-	leaq	1(%r10),%r10
-	xorl	%ecx,%eax
-	movb	%al,-1(%rdi,%r10,1)
-	decq	%rdx
-	jnz	L$oop_tail8x
-
-L$done8x:
-	vzeroall
-	leaq	(%r9),%rsp
-
-L$8x_epilogue:
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S b/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S
deleted file mode 100644
index 188ce56..0000000
--- a/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64-apple.S
+++ /dev/null
@@ -1,3079 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.section	__DATA,__const
-
-.p2align	4
-one:
-.quad	1,0
-two:
-.quad	2,0
-three:
-.quad	3,0
-four:
-.quad	4,0
-five:
-.quad	5,0
-six:
-.quad	6,0
-seven:
-.quad	7,0
-eight:
-.quad	8,0
-
-OR_MASK:
-.long	0x00000000,0x00000000,0x00000000,0x80000000
-poly:
-.quad	0x1, 0xc200000000000000
-mask:
-.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
-con1:
-.long	1,1,1,1
-con2:
-.long	0x1b,0x1b,0x1b,0x1b
-con3:
-.byte	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
-and_mask:
-.long	0,0xffffffff, 0xffffffff, 0xffffffff
-.text	
-
-.p2align	4
-GFMUL:
-
-	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
-	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm5
-	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
-	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$8,%xmm3,%xmm4
-	vpsrldq	$8,%xmm3,%xmm3
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpxor	%xmm3,%xmm5,%xmm5
-
-	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
-	vpshufd	$78,%xmm2,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm2
-
-	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
-	vpshufd	$78,%xmm2,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm2
-
-	vpxor	%xmm5,%xmm2,%xmm0
-	ret
-
-
-.globl	_aesgcmsiv_htable_init
-.private_extern _aesgcmsiv_htable_init
-
-.p2align	4
-_aesgcmsiv_htable_init:
-
-_CET_ENDBR
-	vmovdqa	(%rsi),%xmm0
-	vmovdqa	%xmm0,%xmm1
-	vmovdqa	%xmm0,(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,16(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,32(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,48(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,64(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,80(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,96(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,112(%rdi)
-	ret
-
-
-.globl	_aesgcmsiv_htable6_init
-.private_extern _aesgcmsiv_htable6_init
-
-.p2align	4
-_aesgcmsiv_htable6_init:
-
-_CET_ENDBR
-	vmovdqa	(%rsi),%xmm0
-	vmovdqa	%xmm0,%xmm1
-	vmovdqa	%xmm0,(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,16(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,32(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,48(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,64(%rdi)
-	call	GFMUL
-	vmovdqa	%xmm0,80(%rdi)
-	ret
-
-
-.globl	_aesgcmsiv_htable_polyval
-.private_extern _aesgcmsiv_htable_polyval
-
-.p2align	4
-_aesgcmsiv_htable_polyval:
-
-_CET_ENDBR
-	testq	%rdx,%rdx
-	jnz	L$htable_polyval_start
-	ret
-
-L$htable_polyval_start:
-	vzeroall
-
-
-
-	movq	%rdx,%r11
-	andq	$127,%r11
-
-	jz	L$htable_polyval_no_prefix
-
-	vpxor	%xmm9,%xmm9,%xmm9
-	vmovdqa	(%rcx),%xmm1
-	subq	%r11,%rdx
-
-	subq	$16,%r11
-
-
-	vmovdqu	(%rsi),%xmm0
-	vpxor	%xmm1,%xmm0,%xmm0
-
-	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm5
-	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm3
-	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm4
-	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-	leaq	16(%rsi),%rsi
-	testq	%r11,%r11
-	jnz	L$htable_polyval_prefix_loop
-	jmp	L$htable_polyval_prefix_complete
-
-
-.p2align	6
-L$htable_polyval_prefix_loop:
-	subq	$16,%r11
-
-	vmovdqu	(%rsi),%xmm0
-
-	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-	testq	%r11,%r11
-
-	leaq	16(%rsi),%rsi
-
-	jnz	L$htable_polyval_prefix_loop
-
-L$htable_polyval_prefix_complete:
-	vpsrldq	$8,%xmm5,%xmm6
-	vpslldq	$8,%xmm5,%xmm5
-
-	vpxor	%xmm6,%xmm4,%xmm9
-	vpxor	%xmm5,%xmm3,%xmm1
-
-	jmp	L$htable_polyval_main_loop
-
-L$htable_polyval_no_prefix:
-
-
-
-
-	vpxor	%xmm1,%xmm1,%xmm1
-	vmovdqa	(%rcx),%xmm9
-
-.p2align	6
-L$htable_polyval_main_loop:
-	subq	$0x80,%rdx
-	jb	L$htable_polyval_out
-
-	vmovdqu	112(%rsi),%xmm0
-
-	vpclmulqdq	$0x01,(%rdi),%xmm0,%xmm5
-	vpclmulqdq	$0x00,(%rdi),%xmm0,%xmm3
-	vpclmulqdq	$0x11,(%rdi),%xmm0,%xmm4
-	vpclmulqdq	$0x10,(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vmovdqu	96(%rsi),%xmm0
-	vpclmulqdq	$0x01,16(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,16(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,16(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,16(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-
-	vmovdqu	80(%rsi),%xmm0
-
-	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
-	vpalignr	$8,%xmm1,%xmm1,%xmm1
-
-	vpclmulqdq	$0x01,32(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,32(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,32(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,32(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vpxor	%xmm7,%xmm1,%xmm1
-
-	vmovdqu	64(%rsi),%xmm0
-
-	vpclmulqdq	$0x01,48(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,48(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,48(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,48(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vmovdqu	48(%rsi),%xmm0
-
-	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
-	vpalignr	$8,%xmm1,%xmm1,%xmm1
-
-	vpclmulqdq	$0x01,64(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,64(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,64(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,64(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vpxor	%xmm7,%xmm1,%xmm1
-
-	vmovdqu	32(%rsi),%xmm0
-
-	vpclmulqdq	$0x01,80(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,80(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,80(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,80(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vpxor	%xmm9,%xmm1,%xmm1
-
-	vmovdqu	16(%rsi),%xmm0
-
-	vpclmulqdq	$0x01,96(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,96(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,96(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,96(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vmovdqu	0(%rsi),%xmm0
-	vpxor	%xmm1,%xmm0,%xmm0
-
-	vpclmulqdq	$0x01,112(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x00,112(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm3,%xmm3
-	vpclmulqdq	$0x11,112(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm4,%xmm4
-	vpclmulqdq	$0x10,112(%rdi),%xmm0,%xmm6
-	vpxor	%xmm6,%xmm5,%xmm5
-
-
-	vpsrldq	$8,%xmm5,%xmm6
-	vpslldq	$8,%xmm5,%xmm5
-
-	vpxor	%xmm6,%xmm4,%xmm9
-	vpxor	%xmm5,%xmm3,%xmm1
-
-	leaq	128(%rsi),%rsi
-	jmp	L$htable_polyval_main_loop
-
-
-
-L$htable_polyval_out:
-	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
-	vpalignr	$8,%xmm1,%xmm1,%xmm1
-	vpxor	%xmm6,%xmm1,%xmm1
-
-	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
-	vpalignr	$8,%xmm1,%xmm1,%xmm1
-	vpxor	%xmm6,%xmm1,%xmm1
-	vpxor	%xmm9,%xmm1,%xmm1
-
-	vmovdqu	%xmm1,(%rcx)
-	vzeroupper
-	ret
-
-
-.globl	_aesgcmsiv_polyval_horner
-.private_extern _aesgcmsiv_polyval_horner
-
-.p2align	4
-_aesgcmsiv_polyval_horner:
-
-_CET_ENDBR
-	testq	%rcx,%rcx
-	jnz	L$polyval_horner_start
-	ret
-
-L$polyval_horner_start:
-
-
-
-	xorq	%r10,%r10
-	shlq	$4,%rcx
-
-	vmovdqa	(%rsi),%xmm1
-	vmovdqa	(%rdi),%xmm0
-
-L$polyval_horner_loop:
-	vpxor	(%rdx,%r10,1),%xmm0,%xmm0
-	call	GFMUL
-
-	addq	$16,%r10
-	cmpq	%r10,%rcx
-	jne	L$polyval_horner_loop
-
-
-	vmovdqa	%xmm0,(%rdi)
-	ret
-
-
-.globl	_aes128gcmsiv_aes_ks
-.private_extern _aes128gcmsiv_aes_ks
-
-.p2align	4
-_aes128gcmsiv_aes_ks:
-
-_CET_ENDBR
-	vmovdqu	(%rdi),%xmm1
-	vmovdqa	%xmm1,(%rsi)
-
-	vmovdqa	con1(%rip),%xmm0
-	vmovdqa	mask(%rip),%xmm15
-
-	movq	$8,%rax
-
-L$ks128_loop:
-	addq	$16,%rsi
-	subq	$1,%rax
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm1,(%rsi)
-	jne	L$ks128_loop
-
-	vmovdqa	con2(%rip),%xmm0
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm1,16(%rsi)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslldq	$4,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpslldq	$4,%xmm3,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm1,32(%rsi)
-	ret
-
-
-.globl	_aes256gcmsiv_aes_ks
-.private_extern _aes256gcmsiv_aes_ks
-
-.p2align	4
-_aes256gcmsiv_aes_ks:
-
-_CET_ENDBR
-	vmovdqu	(%rdi),%xmm1
-	vmovdqu	16(%rdi),%xmm3
-	vmovdqa	%xmm1,(%rsi)
-	vmovdqa	%xmm3,16(%rsi)
-	vmovdqa	con1(%rip),%xmm0
-	vmovdqa	mask(%rip),%xmm15
-	vpxor	%xmm14,%xmm14,%xmm14
-	movq	$6,%rax
-
-L$ks256_loop:
-	addq	$32,%rsi
-	subq	$1,%rax
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm1,(%rsi)
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpsllq	$32,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpshufb	con3(%rip),%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vmovdqa	%xmm3,16(%rsi)
-	jne	L$ks256_loop
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpsllq	$32,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vmovdqa	%xmm1,32(%rsi)
-	ret
-
-.globl	_aes128gcmsiv_aes_ks_enc_x1
-.private_extern _aes128gcmsiv_aes_ks_enc_x1
-
-.p2align	4
-_aes128gcmsiv_aes_ks_enc_x1:
-
-_CET_ENDBR
-	vmovdqa	(%rcx),%xmm1
-	vmovdqa	0(%rdi),%xmm4
-
-	vmovdqa	%xmm1,(%rdx)
-	vpxor	%xmm1,%xmm4,%xmm4
-
-	vmovdqa	con1(%rip),%xmm0
-	vmovdqa	mask(%rip),%xmm15
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,16(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,32(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,48(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,64(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,80(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,96(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,112(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,128(%rdx)
-
-
-	vmovdqa	con2(%rip),%xmm0
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,144(%rdx)
-
-	vpshufb	%xmm15,%xmm1,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpsllq	$32,%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpshufb	con3(%rip),%xmm1,%xmm3
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-
-	vaesenclast	%xmm1,%xmm4,%xmm4
-	vmovdqa	%xmm1,160(%rdx)
-
-
-	vmovdqa	%xmm4,0(%rsi)
-	ret
-
-
-.globl	_aes128gcmsiv_kdf
-.private_extern _aes128gcmsiv_kdf
-
-.p2align	4
-_aes128gcmsiv_kdf:
-
-_CET_ENDBR
-
-
-
-
-	vmovdqa	(%rdx),%xmm1
-	vmovdqa	0(%rdi),%xmm9
-	vmovdqa	and_mask(%rip),%xmm12
-	vmovdqa	one(%rip),%xmm13
-	vpshufd	$0x90,%xmm9,%xmm9
-	vpand	%xmm12,%xmm9,%xmm9
-	vpaddd	%xmm13,%xmm9,%xmm10
-	vpaddd	%xmm13,%xmm10,%xmm11
-	vpaddd	%xmm13,%xmm11,%xmm12
-
-	vpxor	%xmm1,%xmm9,%xmm9
-	vpxor	%xmm1,%xmm10,%xmm10
-	vpxor	%xmm1,%xmm11,%xmm11
-	vpxor	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	16(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	32(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm9,%xmm9
-	vaesenc	%xmm2,%xmm10,%xmm10
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-
-	vmovdqa	48(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	64(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm9,%xmm9
-	vaesenc	%xmm2,%xmm10,%xmm10
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-
-	vmovdqa	80(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	96(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm9,%xmm9
-	vaesenc	%xmm2,%xmm10,%xmm10
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-
-	vmovdqa	112(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	128(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm9,%xmm9
-	vaesenc	%xmm2,%xmm10,%xmm10
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-
-	vmovdqa	144(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-
-	vmovdqa	160(%rdx),%xmm2
-	vaesenclast	%xmm2,%xmm9,%xmm9
-	vaesenclast	%xmm2,%xmm10,%xmm10
-	vaesenclast	%xmm2,%xmm11,%xmm11
-	vaesenclast	%xmm2,%xmm12,%xmm12
-
-
-	vmovdqa	%xmm9,0(%rsi)
-	vmovdqa	%xmm10,16(%rsi)
-	vmovdqa	%xmm11,32(%rsi)
-	vmovdqa	%xmm12,48(%rsi)
-	ret
-
-
-.globl	_aes128gcmsiv_enc_msg_x4
-.private_extern _aes128gcmsiv_enc_msg_x4
-
-.p2align	4
-_aes128gcmsiv_enc_msg_x4:
-
-_CET_ENDBR
-	testq	%r8,%r8
-	jnz	L$128_enc_msg_x4_start
-	ret
-
-L$128_enc_msg_x4_start:
-	pushq	%r12
-
-	pushq	%r13
-
-
-	shrq	$4,%r8
-	movq	%r8,%r10
-	shlq	$62,%r10
-	shrq	$62,%r10
-
-
-	vmovdqa	(%rdx),%xmm15
-	vpor	OR_MASK(%rip),%xmm15,%xmm15
-
-	vmovdqu	four(%rip),%xmm4
-	vmovdqa	%xmm15,%xmm0
-	vpaddd	one(%rip),%xmm15,%xmm1
-	vpaddd	two(%rip),%xmm15,%xmm2
-	vpaddd	three(%rip),%xmm15,%xmm3
-
-	shrq	$2,%r8
-	je	L$128_enc_msg_x4_check_remainder
-
-	subq	$64,%rsi
-	subq	$64,%rdi
-
-L$128_enc_msg_x4_loop1:
-	addq	$64,%rsi
-	addq	$64,%rdi
-
-	vmovdqa	%xmm0,%xmm5
-	vmovdqa	%xmm1,%xmm6
-	vmovdqa	%xmm2,%xmm7
-	vmovdqa	%xmm3,%xmm8
-
-	vpxor	(%rcx),%xmm5,%xmm5
-	vpxor	(%rcx),%xmm6,%xmm6
-	vpxor	(%rcx),%xmm7,%xmm7
-	vpxor	(%rcx),%xmm8,%xmm8
-
-	vmovdqu	16(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm0,%xmm0
-	vmovdqu	32(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm1,%xmm1
-	vmovdqu	48(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm2,%xmm2
-	vmovdqu	64(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm3,%xmm3
-
-	vmovdqu	80(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	96(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	112(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	128(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	144(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	160(%rcx),%xmm12
-	vaesenclast	%xmm12,%xmm5,%xmm5
-	vaesenclast	%xmm12,%xmm6,%xmm6
-	vaesenclast	%xmm12,%xmm7,%xmm7
-	vaesenclast	%xmm12,%xmm8,%xmm8
-
-
-
-	vpxor	0(%rdi),%xmm5,%xmm5
-	vpxor	16(%rdi),%xmm6,%xmm6
-	vpxor	32(%rdi),%xmm7,%xmm7
-	vpxor	48(%rdi),%xmm8,%xmm8
-
-	subq	$1,%r8
-
-	vmovdqu	%xmm5,0(%rsi)
-	vmovdqu	%xmm6,16(%rsi)
-	vmovdqu	%xmm7,32(%rsi)
-	vmovdqu	%xmm8,48(%rsi)
-
-	jne	L$128_enc_msg_x4_loop1
-
-	addq	$64,%rsi
-	addq	$64,%rdi
-
-L$128_enc_msg_x4_check_remainder:
-	cmpq	$0,%r10
-	je	L$128_enc_msg_x4_out
-
-L$128_enc_msg_x4_loop2:
-
-
-	vmovdqa	%xmm0,%xmm5
-	vpaddd	one(%rip),%xmm0,%xmm0
-
-	vpxor	(%rcx),%xmm5,%xmm5
-	vaesenc	16(%rcx),%xmm5,%xmm5
-	vaesenc	32(%rcx),%xmm5,%xmm5
-	vaesenc	48(%rcx),%xmm5,%xmm5
-	vaesenc	64(%rcx),%xmm5,%xmm5
-	vaesenc	80(%rcx),%xmm5,%xmm5
-	vaesenc	96(%rcx),%xmm5,%xmm5
-	vaesenc	112(%rcx),%xmm5,%xmm5
-	vaesenc	128(%rcx),%xmm5,%xmm5
-	vaesenc	144(%rcx),%xmm5,%xmm5
-	vaesenclast	160(%rcx),%xmm5,%xmm5
-
-
-	vpxor	(%rdi),%xmm5,%xmm5
-	vmovdqu	%xmm5,(%rsi)
-
-	addq	$16,%rdi
-	addq	$16,%rsi
-
-	subq	$1,%r10
-	jne	L$128_enc_msg_x4_loop2
-
-L$128_enc_msg_x4_out:
-	popq	%r13
-
-	popq	%r12
-
-	ret
-
-
-.globl	_aes128gcmsiv_enc_msg_x8
-.private_extern _aes128gcmsiv_enc_msg_x8
-
-.p2align	4
-_aes128gcmsiv_enc_msg_x8:
-
-_CET_ENDBR
-	testq	%r8,%r8
-	jnz	L$128_enc_msg_x8_start
-	ret
-
-L$128_enc_msg_x8_start:
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%rbp
-
-	movq	%rsp,%rbp
-
-
-
-	subq	$128,%rsp
-	andq	$-64,%rsp
-
-	shrq	$4,%r8
-	movq	%r8,%r10
-	shlq	$61,%r10
-	shrq	$61,%r10
-
-
-	vmovdqu	(%rdx),%xmm1
-	vpor	OR_MASK(%rip),%xmm1,%xmm1
-
-
-	vpaddd	seven(%rip),%xmm1,%xmm0
-	vmovdqu	%xmm0,(%rsp)
-	vpaddd	one(%rip),%xmm1,%xmm9
-	vpaddd	two(%rip),%xmm1,%xmm10
-	vpaddd	three(%rip),%xmm1,%xmm11
-	vpaddd	four(%rip),%xmm1,%xmm12
-	vpaddd	five(%rip),%xmm1,%xmm13
-	vpaddd	six(%rip),%xmm1,%xmm14
-	vmovdqa	%xmm1,%xmm0
-
-	shrq	$3,%r8
-	je	L$128_enc_msg_x8_check_remainder
-
-	subq	$128,%rsi
-	subq	$128,%rdi
-
-L$128_enc_msg_x8_loop1:
-	addq	$128,%rsi
-	addq	$128,%rdi
-
-	vmovdqa	%xmm0,%xmm1
-	vmovdqa	%xmm9,%xmm2
-	vmovdqa	%xmm10,%xmm3
-	vmovdqa	%xmm11,%xmm4
-	vmovdqa	%xmm12,%xmm5
-	vmovdqa	%xmm13,%xmm6
-	vmovdqa	%xmm14,%xmm7
-
-	vmovdqu	(%rsp),%xmm8
-
-	vpxor	(%rcx),%xmm1,%xmm1
-	vpxor	(%rcx),%xmm2,%xmm2
-	vpxor	(%rcx),%xmm3,%xmm3
-	vpxor	(%rcx),%xmm4,%xmm4
-	vpxor	(%rcx),%xmm5,%xmm5
-	vpxor	(%rcx),%xmm6,%xmm6
-	vpxor	(%rcx),%xmm7,%xmm7
-	vpxor	(%rcx),%xmm8,%xmm8
-
-	vmovdqu	16(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	(%rsp),%xmm14
-	vpaddd	eight(%rip),%xmm14,%xmm14
-	vmovdqu	%xmm14,(%rsp)
-	vmovdqu	32(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpsubd	one(%rip),%xmm14,%xmm14
-	vmovdqu	48(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm0,%xmm0
-	vmovdqu	64(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm9,%xmm9
-	vmovdqu	80(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm10,%xmm10
-	vmovdqu	96(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm11,%xmm11
-	vmovdqu	112(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm12,%xmm12
-	vmovdqu	128(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm13,%xmm13
-	vmovdqu	144(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	160(%rcx),%xmm15
-	vaesenclast	%xmm15,%xmm1,%xmm1
-	vaesenclast	%xmm15,%xmm2,%xmm2
-	vaesenclast	%xmm15,%xmm3,%xmm3
-	vaesenclast	%xmm15,%xmm4,%xmm4
-	vaesenclast	%xmm15,%xmm5,%xmm5
-	vaesenclast	%xmm15,%xmm6,%xmm6
-	vaesenclast	%xmm15,%xmm7,%xmm7
-	vaesenclast	%xmm15,%xmm8,%xmm8
-
-
-
-	vpxor	0(%rdi),%xmm1,%xmm1
-	vpxor	16(%rdi),%xmm2,%xmm2
-	vpxor	32(%rdi),%xmm3,%xmm3
-	vpxor	48(%rdi),%xmm4,%xmm4
-	vpxor	64(%rdi),%xmm5,%xmm5
-	vpxor	80(%rdi),%xmm6,%xmm6
-	vpxor	96(%rdi),%xmm7,%xmm7
-	vpxor	112(%rdi),%xmm8,%xmm8
-
-	decq	%r8
-
-	vmovdqu	%xmm1,0(%rsi)
-	vmovdqu	%xmm2,16(%rsi)
-	vmovdqu	%xmm3,32(%rsi)
-	vmovdqu	%xmm4,48(%rsi)
-	vmovdqu	%xmm5,64(%rsi)
-	vmovdqu	%xmm6,80(%rsi)
-	vmovdqu	%xmm7,96(%rsi)
-	vmovdqu	%xmm8,112(%rsi)
-
-	jne	L$128_enc_msg_x8_loop1
-
-	addq	$128,%rsi
-	addq	$128,%rdi
-
-L$128_enc_msg_x8_check_remainder:
-	cmpq	$0,%r10
-	je	L$128_enc_msg_x8_out
-
-L$128_enc_msg_x8_loop2:
-
-
-	vmovdqa	%xmm0,%xmm1
-	vpaddd	one(%rip),%xmm0,%xmm0
-
-	vpxor	(%rcx),%xmm1,%xmm1
-	vaesenc	16(%rcx),%xmm1,%xmm1
-	vaesenc	32(%rcx),%xmm1,%xmm1
-	vaesenc	48(%rcx),%xmm1,%xmm1
-	vaesenc	64(%rcx),%xmm1,%xmm1
-	vaesenc	80(%rcx),%xmm1,%xmm1
-	vaesenc	96(%rcx),%xmm1,%xmm1
-	vaesenc	112(%rcx),%xmm1,%xmm1
-	vaesenc	128(%rcx),%xmm1,%xmm1
-	vaesenc	144(%rcx),%xmm1,%xmm1
-	vaesenclast	160(%rcx),%xmm1,%xmm1
-
-
-	vpxor	(%rdi),%xmm1,%xmm1
-
-	vmovdqu	%xmm1,(%rsi)
-
-	addq	$16,%rdi
-	addq	$16,%rsi
-
-	decq	%r10
-	jne	L$128_enc_msg_x8_loop2
-
-L$128_enc_msg_x8_out:
-	movq	%rbp,%rsp
-
-	popq	%rbp
-
-	popq	%r13
-
-	popq	%r12
-
-	ret
-
-
-.globl	_aes128gcmsiv_dec
-.private_extern _aes128gcmsiv_dec
-
-.p2align	4
-_aes128gcmsiv_dec:
-
-_CET_ENDBR
-	testq	$~15,%r9
-	jnz	L$128_dec_start
-	ret
-
-L$128_dec_start:
-	vzeroupper
-	vmovdqa	(%rdx),%xmm0
-	movq	%rdx,%rax
-
-	leaq	32(%rax),%rax
-	leaq	32(%rcx),%rcx
-
-
-	vmovdqu	(%rdi,%r9,1),%xmm15
-	vpor	OR_MASK(%rip),%xmm15,%xmm15
-	andq	$~15,%r9
-
-
-	cmpq	$96,%r9
-	jb	L$128_dec_loop2
-
-
-	subq	$96,%r9
-	vmovdqa	%xmm15,%xmm7
-	vpaddd	one(%rip),%xmm7,%xmm8
-	vpaddd	two(%rip),%xmm7,%xmm9
-	vpaddd	one(%rip),%xmm9,%xmm10
-	vpaddd	two(%rip),%xmm9,%xmm11
-	vpaddd	one(%rip),%xmm11,%xmm12
-	vpaddd	two(%rip),%xmm11,%xmm15
-
-	vpxor	(%r8),%xmm7,%xmm7
-	vpxor	(%r8),%xmm8,%xmm8
-	vpxor	(%r8),%xmm9,%xmm9
-	vpxor	(%r8),%xmm10,%xmm10
-	vpxor	(%r8),%xmm11,%xmm11
-	vpxor	(%r8),%xmm12,%xmm12
-
-	vmovdqu	16(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	32(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	48(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	64(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	80(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	96(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	112(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	128(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	144(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	160(%r8),%xmm4
-	vaesenclast	%xmm4,%xmm7,%xmm7
-	vaesenclast	%xmm4,%xmm8,%xmm8
-	vaesenclast	%xmm4,%xmm9,%xmm9
-	vaesenclast	%xmm4,%xmm10,%xmm10
-	vaesenclast	%xmm4,%xmm11,%xmm11
-	vaesenclast	%xmm4,%xmm12,%xmm12
-
-
-	vpxor	0(%rdi),%xmm7,%xmm7
-	vpxor	16(%rdi),%xmm8,%xmm8
-	vpxor	32(%rdi),%xmm9,%xmm9
-	vpxor	48(%rdi),%xmm10,%xmm10
-	vpxor	64(%rdi),%xmm11,%xmm11
-	vpxor	80(%rdi),%xmm12,%xmm12
-
-	vmovdqu	%xmm7,0(%rsi)
-	vmovdqu	%xmm8,16(%rsi)
-	vmovdqu	%xmm9,32(%rsi)
-	vmovdqu	%xmm10,48(%rsi)
-	vmovdqu	%xmm11,64(%rsi)
-	vmovdqu	%xmm12,80(%rsi)
-
-	addq	$96,%rdi
-	addq	$96,%rsi
-	jmp	L$128_dec_loop1
-
-
-.p2align	6
-L$128_dec_loop1:
-	cmpq	$96,%r9
-	jb	L$128_dec_finish_96
-	subq	$96,%r9
-
-	vmovdqa	%xmm12,%xmm6
-	vmovdqa	%xmm11,16-32(%rax)
-	vmovdqa	%xmm10,32-32(%rax)
-	vmovdqa	%xmm9,48-32(%rax)
-	vmovdqa	%xmm8,64-32(%rax)
-	vmovdqa	%xmm7,80-32(%rax)
-
-	vmovdqa	%xmm15,%xmm7
-	vpaddd	one(%rip),%xmm7,%xmm8
-	vpaddd	two(%rip),%xmm7,%xmm9
-	vpaddd	one(%rip),%xmm9,%xmm10
-	vpaddd	two(%rip),%xmm9,%xmm11
-	vpaddd	one(%rip),%xmm11,%xmm12
-	vpaddd	two(%rip),%xmm11,%xmm15
-
-	vmovdqa	(%r8),%xmm4
-	vpxor	%xmm4,%xmm7,%xmm7
-	vpxor	%xmm4,%xmm8,%xmm8
-	vpxor	%xmm4,%xmm9,%xmm9
-	vpxor	%xmm4,%xmm10,%xmm10
-	vpxor	%xmm4,%xmm11,%xmm11
-	vpxor	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	0-32(%rcx),%xmm4
-	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
-	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
-	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
-	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	16(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	-16(%rax),%xmm6
-	vmovdqu	-16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	32(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	0(%rax),%xmm6
-	vmovdqu	0(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	48(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	16(%rax),%xmm6
-	vmovdqu	16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	64(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	32(%rax),%xmm6
-	vmovdqu	32(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	80(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	96(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	112(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-
-	vmovdqa	80-32(%rax),%xmm6
-	vpxor	%xmm0,%xmm6,%xmm6
-	vmovdqu	80-32(%rcx),%xmm5
-
-	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	128(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-
-	vpsrldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm5
-	vpslldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm0
-
-	vmovdqa	poly(%rip),%xmm3
-
-	vmovdqu	144(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	160(%r8),%xmm6
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpxor	0(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm7,%xmm7
-	vpxor	16(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm8,%xmm8
-	vpxor	32(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm9,%xmm9
-	vpxor	48(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm10,%xmm10
-	vpxor	64(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm11,%xmm11
-	vpxor	80(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm12,%xmm12
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vmovdqu	%xmm7,0(%rsi)
-	vmovdqu	%xmm8,16(%rsi)
-	vmovdqu	%xmm9,32(%rsi)
-	vmovdqu	%xmm10,48(%rsi)
-	vmovdqu	%xmm11,64(%rsi)
-	vmovdqu	%xmm12,80(%rsi)
-
-	vpxor	%xmm5,%xmm0,%xmm0
-
-	leaq	96(%rdi),%rdi
-	leaq	96(%rsi),%rsi
-	jmp	L$128_dec_loop1
-
-L$128_dec_finish_96:
-	vmovdqa	%xmm12,%xmm6
-	vmovdqa	%xmm11,16-32(%rax)
-	vmovdqa	%xmm10,32-32(%rax)
-	vmovdqa	%xmm9,48-32(%rax)
-	vmovdqa	%xmm8,64-32(%rax)
-	vmovdqa	%xmm7,80-32(%rax)
-
-	vmovdqu	0-32(%rcx),%xmm4
-	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
-	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
-	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
-	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	-16(%rax),%xmm6
-	vmovdqu	-16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	0(%rax),%xmm6
-	vmovdqu	0(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	16(%rax),%xmm6
-	vmovdqu	16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	32(%rax),%xmm6
-	vmovdqu	32(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	80-32(%rax),%xmm6
-	vpxor	%xmm0,%xmm6,%xmm6
-	vmovdqu	80-32(%rcx),%xmm5
-	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vpsrldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm5
-	vpslldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm0
-
-	vmovdqa	poly(%rip),%xmm3
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpxor	%xmm5,%xmm0,%xmm0
-
-L$128_dec_loop2:
-
-
-
-	cmpq	$16,%r9
-	jb	L$128_dec_out
-	subq	$16,%r9
-
-	vmovdqa	%xmm15,%xmm2
-	vpaddd	one(%rip),%xmm15,%xmm15
-
-	vpxor	0(%r8),%xmm2,%xmm2
-	vaesenc	16(%r8),%xmm2,%xmm2
-	vaesenc	32(%r8),%xmm2,%xmm2
-	vaesenc	48(%r8),%xmm2,%xmm2
-	vaesenc	64(%r8),%xmm2,%xmm2
-	vaesenc	80(%r8),%xmm2,%xmm2
-	vaesenc	96(%r8),%xmm2,%xmm2
-	vaesenc	112(%r8),%xmm2,%xmm2
-	vaesenc	128(%r8),%xmm2,%xmm2
-	vaesenc	144(%r8),%xmm2,%xmm2
-	vaesenclast	160(%r8),%xmm2,%xmm2
-	vpxor	(%rdi),%xmm2,%xmm2
-	vmovdqu	%xmm2,(%rsi)
-	addq	$16,%rdi
-	addq	$16,%rsi
-
-	vpxor	%xmm2,%xmm0,%xmm0
-	vmovdqa	-32(%rcx),%xmm1
-	call	GFMUL
-
-	jmp	L$128_dec_loop2
-
-L$128_dec_out:
-	vmovdqu	%xmm0,(%rdx)
-	ret
-
-
-.globl	_aes128gcmsiv_ecb_enc_block
-.private_extern _aes128gcmsiv_ecb_enc_block
-
-.p2align	4
-_aes128gcmsiv_ecb_enc_block:
-
-_CET_ENDBR
-	vmovdqa	(%rdi),%xmm1
-
-	vpxor	(%rdx),%xmm1,%xmm1
-	vaesenc	16(%rdx),%xmm1,%xmm1
-	vaesenc	32(%rdx),%xmm1,%xmm1
-	vaesenc	48(%rdx),%xmm1,%xmm1
-	vaesenc	64(%rdx),%xmm1,%xmm1
-	vaesenc	80(%rdx),%xmm1,%xmm1
-	vaesenc	96(%rdx),%xmm1,%xmm1
-	vaesenc	112(%rdx),%xmm1,%xmm1
-	vaesenc	128(%rdx),%xmm1,%xmm1
-	vaesenc	144(%rdx),%xmm1,%xmm1
-	vaesenclast	160(%rdx),%xmm1,%xmm1
-
-	vmovdqa	%xmm1,(%rsi)
-
-	ret
-
-
-.globl	_aes256gcmsiv_aes_ks_enc_x1
-.private_extern _aes256gcmsiv_aes_ks_enc_x1
-
-.p2align	4
-_aes256gcmsiv_aes_ks_enc_x1:
-
-_CET_ENDBR
-	vmovdqa	con1(%rip),%xmm0
-	vmovdqa	mask(%rip),%xmm15
-	vmovdqa	(%rdi),%xmm8
-	vmovdqa	(%rcx),%xmm1
-	vmovdqa	16(%rcx),%xmm3
-	vpxor	%xmm1,%xmm8,%xmm8
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm1,(%rdx)
-	vmovdqu	%xmm3,16(%rdx)
-	vpxor	%xmm14,%xmm14,%xmm14
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,32(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,48(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,64(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,80(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,96(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,112(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,128(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,144(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,160(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,176(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslld	$1,%xmm0,%xmm0
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenc	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,192(%rdx)
-
-	vpshufd	$0xff,%xmm1,%xmm2
-	vaesenclast	%xmm14,%xmm2,%xmm2
-	vpslldq	$4,%xmm3,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpxor	%xmm2,%xmm3,%xmm3
-	vaesenc	%xmm3,%xmm8,%xmm8
-	vmovdqu	%xmm3,208(%rdx)
-
-	vpshufb	%xmm15,%xmm3,%xmm2
-	vaesenclast	%xmm0,%xmm2,%xmm2
-	vpslldq	$4,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpslldq	$4,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm1,%xmm1
-	vaesenclast	%xmm1,%xmm8,%xmm8
-	vmovdqu	%xmm1,224(%rdx)
-
-	vmovdqa	%xmm8,(%rsi)
-	ret
-
-
-.globl	_aes256gcmsiv_ecb_enc_block
-.private_extern _aes256gcmsiv_ecb_enc_block
-
-.p2align	4
-_aes256gcmsiv_ecb_enc_block:
-
-_CET_ENDBR
-	vmovdqa	(%rdi),%xmm1
-	vpxor	(%rdx),%xmm1,%xmm1
-	vaesenc	16(%rdx),%xmm1,%xmm1
-	vaesenc	32(%rdx),%xmm1,%xmm1
-	vaesenc	48(%rdx),%xmm1,%xmm1
-	vaesenc	64(%rdx),%xmm1,%xmm1
-	vaesenc	80(%rdx),%xmm1,%xmm1
-	vaesenc	96(%rdx),%xmm1,%xmm1
-	vaesenc	112(%rdx),%xmm1,%xmm1
-	vaesenc	128(%rdx),%xmm1,%xmm1
-	vaesenc	144(%rdx),%xmm1,%xmm1
-	vaesenc	160(%rdx),%xmm1,%xmm1
-	vaesenc	176(%rdx),%xmm1,%xmm1
-	vaesenc	192(%rdx),%xmm1,%xmm1
-	vaesenc	208(%rdx),%xmm1,%xmm1
-	vaesenclast	224(%rdx),%xmm1,%xmm1
-	vmovdqa	%xmm1,(%rsi)
-	ret
-
-
-.globl	_aes256gcmsiv_enc_msg_x4
-.private_extern _aes256gcmsiv_enc_msg_x4
-
-.p2align	4
-_aes256gcmsiv_enc_msg_x4:
-
-_CET_ENDBR
-	testq	%r8,%r8
-	jnz	L$256_enc_msg_x4_start
-	ret
-
-L$256_enc_msg_x4_start:
-	movq	%r8,%r10
-	shrq	$4,%r8
-	shlq	$60,%r10
-	jz	L$256_enc_msg_x4_start2
-	addq	$1,%r8
-
-L$256_enc_msg_x4_start2:
-	movq	%r8,%r10
-	shlq	$62,%r10
-	shrq	$62,%r10
-
-
-	vmovdqa	(%rdx),%xmm15
-	vpor	OR_MASK(%rip),%xmm15,%xmm15
-
-	vmovdqa	four(%rip),%xmm4
-	vmovdqa	%xmm15,%xmm0
-	vpaddd	one(%rip),%xmm15,%xmm1
-	vpaddd	two(%rip),%xmm15,%xmm2
-	vpaddd	three(%rip),%xmm15,%xmm3
-
-	shrq	$2,%r8
-	je	L$256_enc_msg_x4_check_remainder
-
-	subq	$64,%rsi
-	subq	$64,%rdi
-
-L$256_enc_msg_x4_loop1:
-	addq	$64,%rsi
-	addq	$64,%rdi
-
-	vmovdqa	%xmm0,%xmm5
-	vmovdqa	%xmm1,%xmm6
-	vmovdqa	%xmm2,%xmm7
-	vmovdqa	%xmm3,%xmm8
-
-	vpxor	(%rcx),%xmm5,%xmm5
-	vpxor	(%rcx),%xmm6,%xmm6
-	vpxor	(%rcx),%xmm7,%xmm7
-	vpxor	(%rcx),%xmm8,%xmm8
-
-	vmovdqu	16(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm0,%xmm0
-	vmovdqu	32(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm1,%xmm1
-	vmovdqu	48(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm2,%xmm2
-	vmovdqu	64(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vpaddd	%xmm4,%xmm3,%xmm3
-
-	vmovdqu	80(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	96(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	112(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	128(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	144(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	160(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	176(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	192(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	208(%rcx),%xmm12
-	vaesenc	%xmm12,%xmm5,%xmm5
-	vaesenc	%xmm12,%xmm6,%xmm6
-	vaesenc	%xmm12,%xmm7,%xmm7
-	vaesenc	%xmm12,%xmm8,%xmm8
-
-	vmovdqu	224(%rcx),%xmm12
-	vaesenclast	%xmm12,%xmm5,%xmm5
-	vaesenclast	%xmm12,%xmm6,%xmm6
-	vaesenclast	%xmm12,%xmm7,%xmm7
-	vaesenclast	%xmm12,%xmm8,%xmm8
-
-
-
-	vpxor	0(%rdi),%xmm5,%xmm5
-	vpxor	16(%rdi),%xmm6,%xmm6
-	vpxor	32(%rdi),%xmm7,%xmm7
-	vpxor	48(%rdi),%xmm8,%xmm8
-
-	subq	$1,%r8
-
-	vmovdqu	%xmm5,0(%rsi)
-	vmovdqu	%xmm6,16(%rsi)
-	vmovdqu	%xmm7,32(%rsi)
-	vmovdqu	%xmm8,48(%rsi)
-
-	jne	L$256_enc_msg_x4_loop1
-
-	addq	$64,%rsi
-	addq	$64,%rdi
-
-L$256_enc_msg_x4_check_remainder:
-	cmpq	$0,%r10
-	je	L$256_enc_msg_x4_out
-
-L$256_enc_msg_x4_loop2:
-
-
-
-	vmovdqa	%xmm0,%xmm5
-	vpaddd	one(%rip),%xmm0,%xmm0
-	vpxor	(%rcx),%xmm5,%xmm5
-	vaesenc	16(%rcx),%xmm5,%xmm5
-	vaesenc	32(%rcx),%xmm5,%xmm5
-	vaesenc	48(%rcx),%xmm5,%xmm5
-	vaesenc	64(%rcx),%xmm5,%xmm5
-	vaesenc	80(%rcx),%xmm5,%xmm5
-	vaesenc	96(%rcx),%xmm5,%xmm5
-	vaesenc	112(%rcx),%xmm5,%xmm5
-	vaesenc	128(%rcx),%xmm5,%xmm5
-	vaesenc	144(%rcx),%xmm5,%xmm5
-	vaesenc	160(%rcx),%xmm5,%xmm5
-	vaesenc	176(%rcx),%xmm5,%xmm5
-	vaesenc	192(%rcx),%xmm5,%xmm5
-	vaesenc	208(%rcx),%xmm5,%xmm5
-	vaesenclast	224(%rcx),%xmm5,%xmm5
-
-
-	vpxor	(%rdi),%xmm5,%xmm5
-
-	vmovdqu	%xmm5,(%rsi)
-
-	addq	$16,%rdi
-	addq	$16,%rsi
-
-	subq	$1,%r10
-	jne	L$256_enc_msg_x4_loop2
-
-L$256_enc_msg_x4_out:
-	ret
-
-
-.globl	_aes256gcmsiv_enc_msg_x8
-.private_extern _aes256gcmsiv_enc_msg_x8
-
-.p2align	4
-_aes256gcmsiv_enc_msg_x8:
-
-_CET_ENDBR
-	testq	%r8,%r8
-	jnz	L$256_enc_msg_x8_start
-	ret
-
-L$256_enc_msg_x8_start:
-
-	movq	%rsp,%r11
-	subq	$16,%r11
-	andq	$-64,%r11
-
-	movq	%r8,%r10
-	shrq	$4,%r8
-	shlq	$60,%r10
-	jz	L$256_enc_msg_x8_start2
-	addq	$1,%r8
-
-L$256_enc_msg_x8_start2:
-	movq	%r8,%r10
-	shlq	$61,%r10
-	shrq	$61,%r10
-
-
-	vmovdqa	(%rdx),%xmm1
-	vpor	OR_MASK(%rip),%xmm1,%xmm1
-
-
-	vpaddd	seven(%rip),%xmm1,%xmm0
-	vmovdqa	%xmm0,(%r11)
-	vpaddd	one(%rip),%xmm1,%xmm9
-	vpaddd	two(%rip),%xmm1,%xmm10
-	vpaddd	three(%rip),%xmm1,%xmm11
-	vpaddd	four(%rip),%xmm1,%xmm12
-	vpaddd	five(%rip),%xmm1,%xmm13
-	vpaddd	six(%rip),%xmm1,%xmm14
-	vmovdqa	%xmm1,%xmm0
-
-	shrq	$3,%r8
-	jz	L$256_enc_msg_x8_check_remainder
-
-	subq	$128,%rsi
-	subq	$128,%rdi
-
-L$256_enc_msg_x8_loop1:
-	addq	$128,%rsi
-	addq	$128,%rdi
-
-	vmovdqa	%xmm0,%xmm1
-	vmovdqa	%xmm9,%xmm2
-	vmovdqa	%xmm10,%xmm3
-	vmovdqa	%xmm11,%xmm4
-	vmovdqa	%xmm12,%xmm5
-	vmovdqa	%xmm13,%xmm6
-	vmovdqa	%xmm14,%xmm7
-
-	vmovdqa	(%r11),%xmm8
-
-	vpxor	(%rcx),%xmm1,%xmm1
-	vpxor	(%rcx),%xmm2,%xmm2
-	vpxor	(%rcx),%xmm3,%xmm3
-	vpxor	(%rcx),%xmm4,%xmm4
-	vpxor	(%rcx),%xmm5,%xmm5
-	vpxor	(%rcx),%xmm6,%xmm6
-	vpxor	(%rcx),%xmm7,%xmm7
-	vpxor	(%rcx),%xmm8,%xmm8
-
-	vmovdqu	16(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqa	(%r11),%xmm14
-	vpaddd	eight(%rip),%xmm14,%xmm14
-	vmovdqa	%xmm14,(%r11)
-	vmovdqu	32(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpsubd	one(%rip),%xmm14,%xmm14
-	vmovdqu	48(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm0,%xmm0
-	vmovdqu	64(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm9,%xmm9
-	vmovdqu	80(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm10,%xmm10
-	vmovdqu	96(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm11,%xmm11
-	vmovdqu	112(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm12,%xmm12
-	vmovdqu	128(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vpaddd	eight(%rip),%xmm13,%xmm13
-	vmovdqu	144(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	160(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	176(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	192(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	208(%rcx),%xmm15
-	vaesenc	%xmm15,%xmm1,%xmm1
-	vaesenc	%xmm15,%xmm2,%xmm2
-	vaesenc	%xmm15,%xmm3,%xmm3
-	vaesenc	%xmm15,%xmm4,%xmm4
-	vaesenc	%xmm15,%xmm5,%xmm5
-	vaesenc	%xmm15,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	224(%rcx),%xmm15
-	vaesenclast	%xmm15,%xmm1,%xmm1
-	vaesenclast	%xmm15,%xmm2,%xmm2
-	vaesenclast	%xmm15,%xmm3,%xmm3
-	vaesenclast	%xmm15,%xmm4,%xmm4
-	vaesenclast	%xmm15,%xmm5,%xmm5
-	vaesenclast	%xmm15,%xmm6,%xmm6
-	vaesenclast	%xmm15,%xmm7,%xmm7
-	vaesenclast	%xmm15,%xmm8,%xmm8
-
-
-
-	vpxor	0(%rdi),%xmm1,%xmm1
-	vpxor	16(%rdi),%xmm2,%xmm2
-	vpxor	32(%rdi),%xmm3,%xmm3
-	vpxor	48(%rdi),%xmm4,%xmm4
-	vpxor	64(%rdi),%xmm5,%xmm5
-	vpxor	80(%rdi),%xmm6,%xmm6
-	vpxor	96(%rdi),%xmm7,%xmm7
-	vpxor	112(%rdi),%xmm8,%xmm8
-
-	subq	$1,%r8
-
-	vmovdqu	%xmm1,0(%rsi)
-	vmovdqu	%xmm2,16(%rsi)
-	vmovdqu	%xmm3,32(%rsi)
-	vmovdqu	%xmm4,48(%rsi)
-	vmovdqu	%xmm5,64(%rsi)
-	vmovdqu	%xmm6,80(%rsi)
-	vmovdqu	%xmm7,96(%rsi)
-	vmovdqu	%xmm8,112(%rsi)
-
-	jne	L$256_enc_msg_x8_loop1
-
-	addq	$128,%rsi
-	addq	$128,%rdi
-
-L$256_enc_msg_x8_check_remainder:
-	cmpq	$0,%r10
-	je	L$256_enc_msg_x8_out
-
-L$256_enc_msg_x8_loop2:
-
-
-	vmovdqa	%xmm0,%xmm1
-	vpaddd	one(%rip),%xmm0,%xmm0
-
-	vpxor	(%rcx),%xmm1,%xmm1
-	vaesenc	16(%rcx),%xmm1,%xmm1
-	vaesenc	32(%rcx),%xmm1,%xmm1
-	vaesenc	48(%rcx),%xmm1,%xmm1
-	vaesenc	64(%rcx),%xmm1,%xmm1
-	vaesenc	80(%rcx),%xmm1,%xmm1
-	vaesenc	96(%rcx),%xmm1,%xmm1
-	vaesenc	112(%rcx),%xmm1,%xmm1
-	vaesenc	128(%rcx),%xmm1,%xmm1
-	vaesenc	144(%rcx),%xmm1,%xmm1
-	vaesenc	160(%rcx),%xmm1,%xmm1
-	vaesenc	176(%rcx),%xmm1,%xmm1
-	vaesenc	192(%rcx),%xmm1,%xmm1
-	vaesenc	208(%rcx),%xmm1,%xmm1
-	vaesenclast	224(%rcx),%xmm1,%xmm1
-
-
-	vpxor	(%rdi),%xmm1,%xmm1
-
-	vmovdqu	%xmm1,(%rsi)
-
-	addq	$16,%rdi
-	addq	$16,%rsi
-	subq	$1,%r10
-	jnz	L$256_enc_msg_x8_loop2
-
-L$256_enc_msg_x8_out:
-	ret
-
-
-
-.globl	_aes256gcmsiv_dec
-.private_extern _aes256gcmsiv_dec
-
-.p2align	4
-_aes256gcmsiv_dec:
-
-_CET_ENDBR
-	testq	$~15,%r9
-	jnz	L$256_dec_start
-	ret
-
-L$256_dec_start:
-	vzeroupper
-	vmovdqa	(%rdx),%xmm0
-	movq	%rdx,%rax
-
-	leaq	32(%rax),%rax
-	leaq	32(%rcx),%rcx
-
-
-	vmovdqu	(%rdi,%r9,1),%xmm15
-	vpor	OR_MASK(%rip),%xmm15,%xmm15
-	andq	$~15,%r9
-
-
-	cmpq	$96,%r9
-	jb	L$256_dec_loop2
-
-
-	subq	$96,%r9
-	vmovdqa	%xmm15,%xmm7
-	vpaddd	one(%rip),%xmm7,%xmm8
-	vpaddd	two(%rip),%xmm7,%xmm9
-	vpaddd	one(%rip),%xmm9,%xmm10
-	vpaddd	two(%rip),%xmm9,%xmm11
-	vpaddd	one(%rip),%xmm11,%xmm12
-	vpaddd	two(%rip),%xmm11,%xmm15
-
-	vpxor	(%r8),%xmm7,%xmm7
-	vpxor	(%r8),%xmm8,%xmm8
-	vpxor	(%r8),%xmm9,%xmm9
-	vpxor	(%r8),%xmm10,%xmm10
-	vpxor	(%r8),%xmm11,%xmm11
-	vpxor	(%r8),%xmm12,%xmm12
-
-	vmovdqu	16(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	32(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	48(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	64(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	80(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	96(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	112(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	128(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	144(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	160(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	176(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	192(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	208(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	224(%r8),%xmm4
-	vaesenclast	%xmm4,%xmm7,%xmm7
-	vaesenclast	%xmm4,%xmm8,%xmm8
-	vaesenclast	%xmm4,%xmm9,%xmm9
-	vaesenclast	%xmm4,%xmm10,%xmm10
-	vaesenclast	%xmm4,%xmm11,%xmm11
-	vaesenclast	%xmm4,%xmm12,%xmm12
-
-
-	vpxor	0(%rdi),%xmm7,%xmm7
-	vpxor	16(%rdi),%xmm8,%xmm8
-	vpxor	32(%rdi),%xmm9,%xmm9
-	vpxor	48(%rdi),%xmm10,%xmm10
-	vpxor	64(%rdi),%xmm11,%xmm11
-	vpxor	80(%rdi),%xmm12,%xmm12
-
-	vmovdqu	%xmm7,0(%rsi)
-	vmovdqu	%xmm8,16(%rsi)
-	vmovdqu	%xmm9,32(%rsi)
-	vmovdqu	%xmm10,48(%rsi)
-	vmovdqu	%xmm11,64(%rsi)
-	vmovdqu	%xmm12,80(%rsi)
-
-	addq	$96,%rdi
-	addq	$96,%rsi
-	jmp	L$256_dec_loop1
-
-
-.p2align	6
-L$256_dec_loop1:
-	cmpq	$96,%r9
-	jb	L$256_dec_finish_96
-	subq	$96,%r9
-
-	vmovdqa	%xmm12,%xmm6
-	vmovdqa	%xmm11,16-32(%rax)
-	vmovdqa	%xmm10,32-32(%rax)
-	vmovdqa	%xmm9,48-32(%rax)
-	vmovdqa	%xmm8,64-32(%rax)
-	vmovdqa	%xmm7,80-32(%rax)
-
-	vmovdqa	%xmm15,%xmm7
-	vpaddd	one(%rip),%xmm7,%xmm8
-	vpaddd	two(%rip),%xmm7,%xmm9
-	vpaddd	one(%rip),%xmm9,%xmm10
-	vpaddd	two(%rip),%xmm9,%xmm11
-	vpaddd	one(%rip),%xmm11,%xmm12
-	vpaddd	two(%rip),%xmm11,%xmm15
-
-	vmovdqa	(%r8),%xmm4
-	vpxor	%xmm4,%xmm7,%xmm7
-	vpxor	%xmm4,%xmm8,%xmm8
-	vpxor	%xmm4,%xmm9,%xmm9
-	vpxor	%xmm4,%xmm10,%xmm10
-	vpxor	%xmm4,%xmm11,%xmm11
-	vpxor	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	0-32(%rcx),%xmm4
-	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
-	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
-	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
-	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	16(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	-16(%rax),%xmm6
-	vmovdqu	-16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	32(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	0(%rax),%xmm6
-	vmovdqu	0(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	48(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	16(%rax),%xmm6
-	vmovdqu	16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	64(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	32(%rax),%xmm6
-	vmovdqu	32(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	80(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	96(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	112(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-
-	vmovdqa	80-32(%rax),%xmm6
-	vpxor	%xmm0,%xmm6,%xmm6
-	vmovdqu	80-32(%rcx),%xmm5
-
-	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	128(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-
-	vpsrldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm5
-	vpslldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm0
-
-	vmovdqa	poly(%rip),%xmm3
-
-	vmovdqu	144(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	160(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	176(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	192(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	208(%r8),%xmm4
-	vaesenc	%xmm4,%xmm7,%xmm7
-	vaesenc	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm4,%xmm9,%xmm9
-	vaesenc	%xmm4,%xmm10,%xmm10
-	vaesenc	%xmm4,%xmm11,%xmm11
-	vaesenc	%xmm4,%xmm12,%xmm12
-
-	vmovdqu	224(%r8),%xmm6
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpxor	0(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm7,%xmm7
-	vpxor	16(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm8,%xmm8
-	vpxor	32(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm9,%xmm9
-	vpxor	48(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm10,%xmm10
-	vpxor	64(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm11,%xmm11
-	vpxor	80(%rdi),%xmm6,%xmm4
-	vaesenclast	%xmm4,%xmm12,%xmm12
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vmovdqu	%xmm7,0(%rsi)
-	vmovdqu	%xmm8,16(%rsi)
-	vmovdqu	%xmm9,32(%rsi)
-	vmovdqu	%xmm10,48(%rsi)
-	vmovdqu	%xmm11,64(%rsi)
-	vmovdqu	%xmm12,80(%rsi)
-
-	vpxor	%xmm5,%xmm0,%xmm0
-
-	leaq	96(%rdi),%rdi
-	leaq	96(%rsi),%rsi
-	jmp	L$256_dec_loop1
-
-L$256_dec_finish_96:
-	vmovdqa	%xmm12,%xmm6
-	vmovdqa	%xmm11,16-32(%rax)
-	vmovdqa	%xmm10,32-32(%rax)
-	vmovdqa	%xmm9,48-32(%rax)
-	vmovdqa	%xmm8,64-32(%rax)
-	vmovdqa	%xmm7,80-32(%rax)
-
-	vmovdqu	0-32(%rcx),%xmm4
-	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
-	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
-	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
-	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	-16(%rax),%xmm6
-	vmovdqu	-16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	0(%rax),%xmm6
-	vmovdqu	0(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	16(%rax),%xmm6
-	vmovdqu	16(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vmovdqu	32(%rax),%xmm6
-	vmovdqu	32(%rcx),%xmm13
-
-	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-
-	vmovdqu	80-32(%rax),%xmm6
-	vpxor	%xmm0,%xmm6,%xmm6
-	vmovdqu	80-32(%rcx),%xmm5
-	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vpsrldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm2,%xmm5
-	vpslldq	$8,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm0
-
-	vmovdqa	poly(%rip),%xmm3
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpalignr	$8,%xmm0,%xmm0,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
-	vpxor	%xmm0,%xmm2,%xmm0
-
-	vpxor	%xmm5,%xmm0,%xmm0
-
-L$256_dec_loop2:
-
-
-
-	cmpq	$16,%r9
-	jb	L$256_dec_out
-	subq	$16,%r9
-
-	vmovdqa	%xmm15,%xmm2
-	vpaddd	one(%rip),%xmm15,%xmm15
-
-	vpxor	0(%r8),%xmm2,%xmm2
-	vaesenc	16(%r8),%xmm2,%xmm2
-	vaesenc	32(%r8),%xmm2,%xmm2
-	vaesenc	48(%r8),%xmm2,%xmm2
-	vaesenc	64(%r8),%xmm2,%xmm2
-	vaesenc	80(%r8),%xmm2,%xmm2
-	vaesenc	96(%r8),%xmm2,%xmm2
-	vaesenc	112(%r8),%xmm2,%xmm2
-	vaesenc	128(%r8),%xmm2,%xmm2
-	vaesenc	144(%r8),%xmm2,%xmm2
-	vaesenc	160(%r8),%xmm2,%xmm2
-	vaesenc	176(%r8),%xmm2,%xmm2
-	vaesenc	192(%r8),%xmm2,%xmm2
-	vaesenc	208(%r8),%xmm2,%xmm2
-	vaesenclast	224(%r8),%xmm2,%xmm2
-	vpxor	(%rdi),%xmm2,%xmm2
-	vmovdqu	%xmm2,(%rsi)
-	addq	$16,%rdi
-	addq	$16,%rsi
-
-	vpxor	%xmm2,%xmm0,%xmm0
-	vmovdqa	-32(%rcx),%xmm1
-	call	GFMUL
-
-	jmp	L$256_dec_loop2
-
-L$256_dec_out:
-	vmovdqu	%xmm0,(%rdx)
-	ret
-
-
-.globl	_aes256gcmsiv_kdf
-.private_extern _aes256gcmsiv_kdf
-
-.p2align	4
-_aes256gcmsiv_kdf:
-
-_CET_ENDBR
-
-
-
-
-	vmovdqa	(%rdx),%xmm1
-	vmovdqa	0(%rdi),%xmm4
-	vmovdqa	and_mask(%rip),%xmm11
-	vmovdqa	one(%rip),%xmm8
-	vpshufd	$0x90,%xmm4,%xmm4
-	vpand	%xmm11,%xmm4,%xmm4
-	vpaddd	%xmm8,%xmm4,%xmm6
-	vpaddd	%xmm8,%xmm6,%xmm7
-	vpaddd	%xmm8,%xmm7,%xmm11
-	vpaddd	%xmm8,%xmm11,%xmm12
-	vpaddd	%xmm8,%xmm12,%xmm13
-
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpxor	%xmm1,%xmm6,%xmm6
-	vpxor	%xmm1,%xmm7,%xmm7
-	vpxor	%xmm1,%xmm11,%xmm11
-	vpxor	%xmm1,%xmm12,%xmm12
-	vpxor	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	16(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	32(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	48(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	64(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	80(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	96(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	112(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	128(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	144(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	160(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	176(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	192(%rdx),%xmm2
-	vaesenc	%xmm2,%xmm4,%xmm4
-	vaesenc	%xmm2,%xmm6,%xmm6
-	vaesenc	%xmm2,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vaesenc	%xmm2,%xmm13,%xmm13
-
-	vmovdqa	208(%rdx),%xmm1
-	vaesenc	%xmm1,%xmm4,%xmm4
-	vaesenc	%xmm1,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-
-	vmovdqa	224(%rdx),%xmm2
-	vaesenclast	%xmm2,%xmm4,%xmm4
-	vaesenclast	%xmm2,%xmm6,%xmm6
-	vaesenclast	%xmm2,%xmm7,%xmm7
-	vaesenclast	%xmm2,%xmm11,%xmm11
-	vaesenclast	%xmm2,%xmm12,%xmm12
-	vaesenclast	%xmm2,%xmm13,%xmm13
-
-
-	vmovdqa	%xmm4,0(%rsi)
-	vmovdqa	%xmm6,16(%rsi)
-	vmovdqa	%xmm7,32(%rsi)
-	vmovdqa	%xmm11,48(%rsi)
-	vmovdqa	%xmm12,64(%rsi)
-	vmovdqa	%xmm13,80(%rsi)
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S b/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S
deleted file mode 100644
index e4a7202..0000000
--- a/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64-apple.S
+++ /dev/null
@@ -1,8875 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-chacha20_poly1305_constants:
-
-.section	__DATA,__const
-.p2align	6
-L$chacha20_consts:
-.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-L$rol8:
-.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-L$rol16:
-.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-L$avx2_init:
-.long	0,0,0,0
-L$sse_inc:
-.long	1,0,0,0
-L$avx2_inc:
-.long	2,0,0,0,2,0,0,0
-L$clamp:
-.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
-.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
-.p2align	4
-L$and_masks:
-.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
-.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
-.text	
-
-
-.p2align	6
-poly_hash_ad_internal:
-
-
-	xorq	%r10,%r10
-	xorq	%r11,%r11
-	xorq	%r12,%r12
-	cmpq	$13,%r8
-	jne	L$hash_ad_loop
-L$poly_fast_tls_ad:
-
-	movq	(%rcx),%r10
-	movq	5(%rcx),%r11
-	shrq	$24,%r11
-	movq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	ret
-L$hash_ad_loop:
-
-	cmpq	$16,%r8
-	jb	L$hash_ad_tail
-	addq	0+0(%rcx),%r10
-	adcq	8+0(%rcx),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rcx),%rcx
-	subq	$16,%r8
-	jmp	L$hash_ad_loop
-L$hash_ad_tail:
-	cmpq	$0,%r8
-	je	L$hash_ad_done
-
-	xorq	%r13,%r13
-	xorq	%r14,%r14
-	xorq	%r15,%r15
-	addq	%r8,%rcx
-L$hash_ad_tail_loop:
-	shldq	$8,%r13,%r14
-	shlq	$8,%r13
-	movzbq	-1(%rcx),%r15
-	xorq	%r15,%r13
-	decq	%rcx
-	decq	%r8
-	jne	L$hash_ad_tail_loop
-
-	addq	%r13,%r10
-	adcq	%r14,%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-L$hash_ad_done:
-	ret
-
-
-
-.globl	_chacha20_poly1305_open
-.private_extern _chacha20_poly1305_open
-
-.p2align	6
-_chacha20_poly1305_open:
-
-_CET_ENDBR
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-
-
-	pushq	%r9
-
-	subq	$288 + 0 + 32,%rsp
-
-
-	leaq	32(%rsp),%rbp
-	andq	$-32,%rbp
-
-	movq	%rdx,%rbx
-	movq	%r8,0+0+32(%rbp)
-	movq	%rbx,8+0+32(%rbp)
-
-	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_open_avx2
-
-	cmpq	$128,%rbx
-	jbe	L$open_sse_128
-
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqu	0(%r9),%xmm4
-	movdqu	16(%r9),%xmm8
-	movdqu	32(%r9),%xmm12
-
-	movdqa	%xmm12,%xmm7
-
-	movdqa	%xmm4,0+48(%rbp)
-	movdqa	%xmm8,0+64(%rbp)
-	movdqa	%xmm12,0+96(%rbp)
-	movq	$10,%r10
-L$open_sse_init_rounds:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-
-	decq	%r10
-	jne	L$open_sse_init_rounds
-
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-
-	pand	L$clamp(%rip),%xmm0
-	movdqa	%xmm0,0+0(%rbp)
-	movdqa	%xmm4,0+16(%rbp)
-
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-L$open_sse_main_loop:
-	cmpq	$256,%rbx
-	jb	L$open_sse_tail
-
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm8,%xmm10
-	movdqa	%xmm0,%xmm3
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm11
-	movdqa	0+96(%rbp),%xmm15
-	paddd	L$sse_inc(%rip),%xmm15
-	movdqa	%xmm15,%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm14,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-	movdqa	%xmm15,0+144(%rbp)
-
-
-
-	movq	$4,%rcx
-	movq	%rsi,%r8
-L$open_sse_main_loop_rounds:
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-
-	leaq	16(%r8),%r8
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-
-	decq	%rcx
-	jge	L$open_sse_main_loop_rounds
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%r8),%r8
-	cmpq	$-6,%rcx
-	jg	L$open_sse_main_loop_rounds
-	paddd	L$chacha20_consts(%rip),%xmm3
-	paddd	0+48(%rbp),%xmm7
-	paddd	0+64(%rbp),%xmm11
-	paddd	0+144(%rbp),%xmm15
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqa	%xmm12,0+80(%rbp)
-	movdqu	0 + 0(%rsi),%xmm12
-	pxor	%xmm3,%xmm12
-	movdqu	%xmm12,0 + 0(%rdi)
-	movdqu	16 + 0(%rsi),%xmm12
-	pxor	%xmm7,%xmm12
-	movdqu	%xmm12,16 + 0(%rdi)
-	movdqu	32 + 0(%rsi),%xmm12
-	pxor	%xmm11,%xmm12
-	movdqu	%xmm12,32 + 0(%rdi)
-	movdqu	48 + 0(%rsi),%xmm12
-	pxor	%xmm15,%xmm12
-	movdqu	%xmm12,48 + 0(%rdi)
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 64(%rdi)
-	movdqu	%xmm6,16 + 64(%rdi)
-	movdqu	%xmm10,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-	movdqu	0 + 128(%rsi),%xmm3
-	movdqu	16 + 128(%rsi),%xmm7
-	movdqu	32 + 128(%rsi),%xmm11
-	movdqu	48 + 128(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 128(%rdi)
-	movdqu	%xmm5,16 + 128(%rdi)
-	movdqu	%xmm9,32 + 128(%rdi)
-	movdqu	%xmm15,48 + 128(%rdi)
-	movdqu	0 + 192(%rsi),%xmm3
-	movdqu	16 + 192(%rsi),%xmm7
-	movdqu	32 + 192(%rsi),%xmm11
-	movdqu	48 + 192(%rsi),%xmm15
-	pxor	%xmm3,%xmm0
-	pxor	%xmm7,%xmm4
-	pxor	%xmm11,%xmm8
-	pxor	0+80(%rbp),%xmm15
-	movdqu	%xmm0,0 + 192(%rdi)
-	movdqu	%xmm4,16 + 192(%rdi)
-	movdqu	%xmm8,32 + 192(%rdi)
-	movdqu	%xmm15,48 + 192(%rdi)
-
-	leaq	256(%rsi),%rsi
-	leaq	256(%rdi),%rdi
-	subq	$256,%rbx
-	jmp	L$open_sse_main_loop
-L$open_sse_tail:
-
-	testq	%rbx,%rbx
-	jz	L$open_sse_finalize
-	cmpq	$192,%rbx
-	ja	L$open_sse_tail_256
-	cmpq	$128,%rbx
-	ja	L$open_sse_tail_192
-	cmpq	$64,%rbx
-	ja	L$open_sse_tail_128
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	0+96(%rbp),%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-
-	xorq	%r8,%r8
-	movq	%rbx,%rcx
-	cmpq	$16,%rcx
-	jb	L$open_sse_tail_64_rounds
-L$open_sse_tail_64_rounds_and_x1hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	subq	$16,%rcx
-L$open_sse_tail_64_rounds:
-	addq	$16,%r8
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-
-	cmpq	$16,%rcx
-	jae	L$open_sse_tail_64_rounds_and_x1hash
-	cmpq	$160,%r8
-	jne	L$open_sse_tail_64_rounds
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-
-	jmp	L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_128:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	0+96(%rbp),%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-
-	movq	%rbx,%rcx
-	andq	$-16,%rcx
-	xorq	%r8,%r8
-L$open_sse_tail_128_rounds_and_x1hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-L$open_sse_tail_128_rounds:
-	addq	$16,%r8
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-
-	cmpq	%rcx,%r8
-	jb	L$open_sse_tail_128_rounds_and_x1hash
-	cmpq	$160,%r8
-	jne	L$open_sse_tail_128_rounds
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqu	0 + 0(%rsi),%xmm3
-	movdqu	16 + 0(%rsi),%xmm7
-	movdqu	32 + 0(%rsi),%xmm11
-	movdqu	48 + 0(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 0(%rdi)
-	movdqu	%xmm5,16 + 0(%rdi)
-	movdqu	%xmm9,32 + 0(%rdi)
-	movdqu	%xmm15,48 + 0(%rdi)
-
-	subq	$64,%rbx
-	leaq	64(%rsi),%rsi
-	leaq	64(%rdi),%rdi
-	jmp	L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_192:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm8,%xmm10
-	movdqa	0+96(%rbp),%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm14,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-
-	movq	%rbx,%rcx
-	movq	$160,%r8
-	cmpq	$160,%rcx
-	cmovgq	%r8,%rcx
-	andq	$-16,%rcx
-	xorq	%r8,%r8
-L$open_sse_tail_192_rounds_and_x1hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-L$open_sse_tail_192_rounds:
-	addq	$16,%r8
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-
-	cmpq	%rcx,%r8
-	jb	L$open_sse_tail_192_rounds_and_x1hash
-	cmpq	$160,%r8
-	jne	L$open_sse_tail_192_rounds
-	cmpq	$176,%rbx
-	jb	L$open_sse_tail_192_finish
-	addq	0+160(%rsi),%r10
-	adcq	8+160(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	cmpq	$192,%rbx
-	jb	L$open_sse_tail_192_finish
-	addq	0+176(%rsi),%r10
-	adcq	8+176(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-L$open_sse_tail_192_finish:
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqu	0 + 0(%rsi),%xmm3
-	movdqu	16 + 0(%rsi),%xmm7
-	movdqu	32 + 0(%rsi),%xmm11
-	movdqu	48 + 0(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 0(%rdi)
-	movdqu	%xmm6,16 + 0(%rdi)
-	movdqu	%xmm10,32 + 0(%rdi)
-	movdqu	%xmm15,48 + 0(%rdi)
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 64(%rdi)
-	movdqu	%xmm5,16 + 64(%rdi)
-	movdqu	%xmm9,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-
-	subq	$128,%rbx
-	leaq	128(%rsi),%rsi
-	leaq	128(%rdi),%rdi
-	jmp	L$open_sse_tail_64_dec_loop
-
-L$open_sse_tail_256:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm8,%xmm10
-	movdqa	%xmm0,%xmm3
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm11
-	movdqa	0+96(%rbp),%xmm15
-	paddd	L$sse_inc(%rip),%xmm15
-	movdqa	%xmm15,%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm14,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-	movdqa	%xmm15,0+144(%rbp)
-
-	xorq	%r8,%r8
-L$open_sse_tail_256_rounds_and_x1hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movdqa	%xmm11,0+80(%rbp)
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm4
-	pxor	%xmm11,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm4
-	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm5
-	pxor	%xmm11,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm5
-	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm6
-	pxor	%xmm11,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm6
-	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-	movdqa	0+80(%rbp),%xmm11
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movdqa	%xmm9,0+80(%rbp)
-	paddd	%xmm7,%xmm3
-	pxor	%xmm3,%xmm15
-	pshufb	L$rol16(%rip),%xmm15
-	paddd	%xmm15,%xmm11
-	pxor	%xmm11,%xmm7
-	movdqa	%xmm7,%xmm9
-	pslld	$12,%xmm9
-	psrld	$20,%xmm7
-	pxor	%xmm9,%xmm7
-	paddd	%xmm7,%xmm3
-	pxor	%xmm3,%xmm15
-	pshufb	L$rol8(%rip),%xmm15
-	paddd	%xmm15,%xmm11
-	pxor	%xmm11,%xmm7
-	movdqa	%xmm7,%xmm9
-	pslld	$7,%xmm9
-	psrld	$25,%xmm7
-	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-	movdqa	0+80(%rbp),%xmm9
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	movdqa	%xmm11,0+80(%rbp)
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm4
-	pxor	%xmm11,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm4
-	pxor	%xmm11,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm5
-	pxor	%xmm11,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm5
-	pxor	%xmm11,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm11
-	pslld	$12,%xmm11
-	psrld	$20,%xmm6
-	pxor	%xmm11,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm11
-	pslld	$7,%xmm11
-	psrld	$25,%xmm6
-	pxor	%xmm11,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-	movdqa	0+80(%rbp),%xmm11
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	movdqa	%xmm9,0+80(%rbp)
-	paddd	%xmm7,%xmm3
-	pxor	%xmm3,%xmm15
-	pshufb	L$rol16(%rip),%xmm15
-	paddd	%xmm15,%xmm11
-	pxor	%xmm11,%xmm7
-	movdqa	%xmm7,%xmm9
-	pslld	$12,%xmm9
-	psrld	$20,%xmm7
-	pxor	%xmm9,%xmm7
-	paddd	%xmm7,%xmm3
-	pxor	%xmm3,%xmm15
-	pshufb	L$rol8(%rip),%xmm15
-	paddd	%xmm15,%xmm11
-	pxor	%xmm11,%xmm7
-	movdqa	%xmm7,%xmm9
-	pslld	$7,%xmm9
-	psrld	$25,%xmm7
-	pxor	%xmm9,%xmm7
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-	movdqa	0+80(%rbp),%xmm9
-
-	addq	$16,%r8
-	cmpq	$160,%r8
-	jb	L$open_sse_tail_256_rounds_and_x1hash
-
-	movq	%rbx,%rcx
-	andq	$-16,%rcx
-L$open_sse_tail_256_hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	addq	$16,%r8
-	cmpq	%rcx,%r8
-	jb	L$open_sse_tail_256_hash
-	paddd	L$chacha20_consts(%rip),%xmm3
-	paddd	0+48(%rbp),%xmm7
-	paddd	0+64(%rbp),%xmm11
-	paddd	0+144(%rbp),%xmm15
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqa	%xmm12,0+80(%rbp)
-	movdqu	0 + 0(%rsi),%xmm12
-	pxor	%xmm3,%xmm12
-	movdqu	%xmm12,0 + 0(%rdi)
-	movdqu	16 + 0(%rsi),%xmm12
-	pxor	%xmm7,%xmm12
-	movdqu	%xmm12,16 + 0(%rdi)
-	movdqu	32 + 0(%rsi),%xmm12
-	pxor	%xmm11,%xmm12
-	movdqu	%xmm12,32 + 0(%rdi)
-	movdqu	48 + 0(%rsi),%xmm12
-	pxor	%xmm15,%xmm12
-	movdqu	%xmm12,48 + 0(%rdi)
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 64(%rdi)
-	movdqu	%xmm6,16 + 64(%rdi)
-	movdqu	%xmm10,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-	movdqu	0 + 128(%rsi),%xmm3
-	movdqu	16 + 128(%rsi),%xmm7
-	movdqu	32 + 128(%rsi),%xmm11
-	movdqu	48 + 128(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 128(%rdi)
-	movdqu	%xmm5,16 + 128(%rdi)
-	movdqu	%xmm9,32 + 128(%rdi)
-	movdqu	%xmm15,48 + 128(%rdi)
-
-	movdqa	0+80(%rbp),%xmm12
-	subq	$192,%rbx
-	leaq	192(%rsi),%rsi
-	leaq	192(%rdi),%rdi
-
-
-L$open_sse_tail_64_dec_loop:
-	cmpq	$16,%rbx
-	jb	L$open_sse_tail_16_init
-	subq	$16,%rbx
-	movdqu	(%rsi),%xmm3
-	pxor	%xmm3,%xmm0
-	movdqu	%xmm0,(%rdi)
-	leaq	16(%rsi),%rsi
-	leaq	16(%rdi),%rdi
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm8,%xmm4
-	movdqa	%xmm12,%xmm8
-	jmp	L$open_sse_tail_64_dec_loop
-L$open_sse_tail_16_init:
-	movdqa	%xmm0,%xmm1
-
-
-L$open_sse_tail_16:
-	testq	%rbx,%rbx
-	jz	L$open_sse_finalize
-
-
-
-	pxor	%xmm3,%xmm3
-	leaq	-1(%rsi,%rbx,1),%rsi
-	movq	%rbx,%r8
-L$open_sse_tail_16_compose:
-	pslldq	$1,%xmm3
-	pinsrb	$0,(%rsi),%xmm3
-	subq	$1,%rsi
-	subq	$1,%r8
-	jnz	L$open_sse_tail_16_compose
-
-.byte	102,73,15,126,221
-	pextrq	$1,%xmm3,%r14
-
-	pxor	%xmm1,%xmm3
-
-
-L$open_sse_tail_16_extract:
-	pextrb	$0,%xmm3,(%rdi)
-	psrldq	$1,%xmm3
-	addq	$1,%rdi
-	subq	$1,%rbx
-	jne	L$open_sse_tail_16_extract
-
-	addq	%r13,%r10
-	adcq	%r14,%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-L$open_sse_finalize:
-	addq	0+0+32(%rbp),%r10
-	adcq	8+0+32(%rbp),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-	movq	%r10,%r13
-	movq	%r11,%r14
-	movq	%r12,%r15
-	subq	$-5,%r10
-	sbbq	$-1,%r11
-	sbbq	$3,%r12
-	cmovcq	%r13,%r10
-	cmovcq	%r14,%r11
-	cmovcq	%r15,%r12
-
-	addq	0+0+16(%rbp),%r10
-	adcq	8+0+16(%rbp),%r11
-
-
-	addq	$288 + 0 + 32,%rsp
-
-
-	popq	%r9
-
-	movq	%r10,(%r9)
-	movq	%r11,8(%r9)
-	popq	%r15
-
-	popq	%r14
-
-	popq	%r13
-
-	popq	%r12
-
-	popq	%rbx
-
-	popq	%rbp
-
-	ret
-
-L$open_sse_128:
-
-	movdqu	L$chacha20_consts(%rip),%xmm0
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm0,%xmm2
-	movdqu	0(%r9),%xmm4
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm4,%xmm6
-	movdqu	16(%r9),%xmm8
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm8,%xmm10
-	movdqu	32(%r9),%xmm12
-	movdqa	%xmm12,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm11
-	movdqa	%xmm13,%xmm15
-	movq	$10,%r10
-
-L$open_sse_128_rounds:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-
-	decq	%r10
-	jnz	L$open_sse_128_rounds
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	%xmm7,%xmm4
-	paddd	%xmm7,%xmm5
-	paddd	%xmm7,%xmm6
-	paddd	%xmm11,%xmm9
-	paddd	%xmm11,%xmm10
-	paddd	%xmm15,%xmm13
-	paddd	L$sse_inc(%rip),%xmm15
-	paddd	%xmm15,%xmm14
-
-	pand	L$clamp(%rip),%xmm0
-	movdqa	%xmm0,0+0(%rbp)
-	movdqa	%xmm4,0+16(%rbp)
-
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-L$open_sse_128_xor_hash:
-	cmpq	$16,%rbx
-	jb	L$open_sse_tail_16
-	subq	$16,%rbx
-	addq	0+0(%rsi),%r10
-	adcq	8+0(%rsi),%r11
-	adcq	$1,%r12
-
-
-	movdqu	0(%rsi),%xmm3
-	pxor	%xmm3,%xmm1
-	movdqu	%xmm1,0(%rdi)
-	leaq	16(%rsi),%rsi
-	leaq	16(%rdi),%rdi
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-	movdqa	%xmm5,%xmm1
-	movdqa	%xmm9,%xmm5
-	movdqa	%xmm13,%xmm9
-	movdqa	%xmm2,%xmm13
-	movdqa	%xmm6,%xmm2
-	movdqa	%xmm10,%xmm6
-	movdqa	%xmm14,%xmm10
-	jmp	L$open_sse_128_xor_hash
-
-
-
-
-
-
-
-
-
-.globl	_chacha20_poly1305_seal
-.private_extern _chacha20_poly1305_seal
-
-.p2align	6
-_chacha20_poly1305_seal:
-
-_CET_ENDBR
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-
-
-	pushq	%r9
-
-	subq	$288 + 0 + 32,%rsp
-
-	leaq	32(%rsp),%rbp
-	andq	$-32,%rbp
-
-	movq	56(%r9),%rbx
-	addq	%rdx,%rbx
-	movq	%r8,0+0+32(%rbp)
-	movq	%rbx,8+0+32(%rbp)
-	movq	%rdx,%rbx
-
-	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
-	andl	$288,%eax
-	xorl	$288,%eax
-	jz	chacha20_poly1305_seal_avx2
-
-	cmpq	$128,%rbx
-	jbe	L$seal_sse_128
-
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqu	0(%r9),%xmm4
-	movdqu	16(%r9),%xmm8
-	movdqu	32(%r9),%xmm12
-
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm0,%xmm3
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm8,%xmm10
-	movdqa	%xmm8,%xmm11
-	movdqa	%xmm12,%xmm15
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,%xmm14
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,%xmm13
-	paddd	L$sse_inc(%rip),%xmm12
-
-	movdqa	%xmm4,0+48(%rbp)
-	movdqa	%xmm8,0+64(%rbp)
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-	movdqa	%xmm15,0+144(%rbp)
-	movq	$10,%r10
-L$seal_sse_init_rounds:
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-
-	decq	%r10
-	jnz	L$seal_sse_init_rounds
-	paddd	L$chacha20_consts(%rip),%xmm3
-	paddd	0+48(%rbp),%xmm7
-	paddd	0+64(%rbp),%xmm11
-	paddd	0+144(%rbp),%xmm15
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-
-
-	pand	L$clamp(%rip),%xmm3
-	movdqa	%xmm3,0+0(%rbp)
-	movdqa	%xmm7,0+16(%rbp)
-
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-	movdqu	0 + 0(%rsi),%xmm3
-	movdqu	16 + 0(%rsi),%xmm7
-	movdqu	32 + 0(%rsi),%xmm11
-	movdqu	48 + 0(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 0(%rdi)
-	movdqu	%xmm6,16 + 0(%rdi)
-	movdqu	%xmm10,32 + 0(%rdi)
-	movdqu	%xmm15,48 + 0(%rdi)
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 64(%rdi)
-	movdqu	%xmm5,16 + 64(%rdi)
-	movdqu	%xmm9,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-
-	cmpq	$192,%rbx
-	ja	L$seal_sse_main_init
-	movq	$128,%rcx
-	subq	$128,%rbx
-	leaq	128(%rsi),%rsi
-	jmp	L$seal_sse_128_tail_hash
-L$seal_sse_main_init:
-	movdqu	0 + 128(%rsi),%xmm3
-	movdqu	16 + 128(%rsi),%xmm7
-	movdqu	32 + 128(%rsi),%xmm11
-	movdqu	48 + 128(%rsi),%xmm15
-	pxor	%xmm3,%xmm0
-	pxor	%xmm7,%xmm4
-	pxor	%xmm11,%xmm8
-	pxor	%xmm12,%xmm15
-	movdqu	%xmm0,0 + 128(%rdi)
-	movdqu	%xmm4,16 + 128(%rdi)
-	movdqu	%xmm8,32 + 128(%rdi)
-	movdqu	%xmm15,48 + 128(%rdi)
-
-	movq	$192,%rcx
-	subq	$192,%rbx
-	leaq	192(%rsi),%rsi
-	movq	$2,%rcx
-	movq	$8,%r8
-	cmpq	$64,%rbx
-	jbe	L$seal_sse_tail_64
-	cmpq	$128,%rbx
-	jbe	L$seal_sse_tail_128
-	cmpq	$192,%rbx
-	jbe	L$seal_sse_tail_192
-
-L$seal_sse_main_loop:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm8,%xmm10
-	movdqa	%xmm0,%xmm3
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm11
-	movdqa	0+96(%rbp),%xmm15
-	paddd	L$sse_inc(%rip),%xmm15
-	movdqa	%xmm15,%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm14,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-	movdqa	%xmm15,0+144(%rbp)
-
-.p2align	5
-L$seal_sse_main_rounds:
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-.byte	102,15,58,15,255,4
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,12
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	L$rol16(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$20,%xmm8
-	pslld	$32-20,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	L$rol8(%rip),%xmm8
-	paddd	%xmm7,%xmm3
-	paddd	%xmm6,%xmm2
-	paddd	%xmm5,%xmm1
-	paddd	%xmm4,%xmm0
-	pxor	%xmm3,%xmm15
-	pxor	%xmm2,%xmm14
-	pxor	%xmm1,%xmm13
-	pxor	%xmm0,%xmm12
-.byte	102,69,15,56,0,248
-.byte	102,69,15,56,0,240
-.byte	102,69,15,56,0,232
-.byte	102,69,15,56,0,224
-	movdqa	0+80(%rbp),%xmm8
-	paddd	%xmm15,%xmm11
-	paddd	%xmm14,%xmm10
-	paddd	%xmm13,%xmm9
-	paddd	%xmm12,%xmm8
-	pxor	%xmm11,%xmm7
-	pxor	%xmm10,%xmm6
-	pxor	%xmm9,%xmm5
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm8,0+80(%rbp)
-	movdqa	%xmm7,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm7
-	pxor	%xmm8,%xmm7
-	movdqa	%xmm6,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm6
-	pxor	%xmm8,%xmm6
-	movdqa	%xmm5,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm5
-	pxor	%xmm8,%xmm5
-	movdqa	%xmm4,%xmm8
-	psrld	$25,%xmm8
-	pslld	$32-25,%xmm4
-	pxor	%xmm8,%xmm4
-	movdqa	0+80(%rbp),%xmm8
-.byte	102,15,58,15,255,12
-.byte	102,69,15,58,15,219,8
-.byte	102,69,15,58,15,255,4
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-
-	leaq	16(%rdi),%rdi
-	decq	%r8
-	jge	L$seal_sse_main_rounds
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_sse_main_rounds
-	paddd	L$chacha20_consts(%rip),%xmm3
-	paddd	0+48(%rbp),%xmm7
-	paddd	0+64(%rbp),%xmm11
-	paddd	0+144(%rbp),%xmm15
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-
-	movdqa	%xmm14,0+80(%rbp)
-	movdqa	%xmm14,0+80(%rbp)
-	movdqu	0 + 0(%rsi),%xmm14
-	pxor	%xmm3,%xmm14
-	movdqu	%xmm14,0 + 0(%rdi)
-	movdqu	16 + 0(%rsi),%xmm14
-	pxor	%xmm7,%xmm14
-	movdqu	%xmm14,16 + 0(%rdi)
-	movdqu	32 + 0(%rsi),%xmm14
-	pxor	%xmm11,%xmm14
-	movdqu	%xmm14,32 + 0(%rdi)
-	movdqu	48 + 0(%rsi),%xmm14
-	pxor	%xmm15,%xmm14
-	movdqu	%xmm14,48 + 0(%rdi)
-
-	movdqa	0+80(%rbp),%xmm14
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 64(%rdi)
-	movdqu	%xmm6,16 + 64(%rdi)
-	movdqu	%xmm10,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-	movdqu	0 + 128(%rsi),%xmm3
-	movdqu	16 + 128(%rsi),%xmm7
-	movdqu	32 + 128(%rsi),%xmm11
-	movdqu	48 + 128(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 128(%rdi)
-	movdqu	%xmm5,16 + 128(%rdi)
-	movdqu	%xmm9,32 + 128(%rdi)
-	movdqu	%xmm15,48 + 128(%rdi)
-
-	cmpq	$256,%rbx
-	ja	L$seal_sse_main_loop_xor
-
-	movq	$192,%rcx
-	subq	$192,%rbx
-	leaq	192(%rsi),%rsi
-	jmp	L$seal_sse_128_tail_hash
-L$seal_sse_main_loop_xor:
-	movdqu	0 + 192(%rsi),%xmm3
-	movdqu	16 + 192(%rsi),%xmm7
-	movdqu	32 + 192(%rsi),%xmm11
-	movdqu	48 + 192(%rsi),%xmm15
-	pxor	%xmm3,%xmm0
-	pxor	%xmm7,%xmm4
-	pxor	%xmm11,%xmm8
-	pxor	%xmm12,%xmm15
-	movdqu	%xmm0,0 + 192(%rdi)
-	movdqu	%xmm4,16 + 192(%rdi)
-	movdqu	%xmm8,32 + 192(%rdi)
-	movdqu	%xmm15,48 + 192(%rdi)
-
-	leaq	256(%rsi),%rsi
-	subq	$256,%rbx
-	movq	$6,%rcx
-	movq	$4,%r8
-	cmpq	$192,%rbx
-	jg	L$seal_sse_main_loop
-	movq	%rbx,%rcx
-	testq	%rbx,%rbx
-	je	L$seal_sse_128_tail_hash
-	movq	$6,%rcx
-	cmpq	$128,%rbx
-	ja	L$seal_sse_tail_192
-	cmpq	$64,%rbx
-	ja	L$seal_sse_tail_128
-
-L$seal_sse_tail_64:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	0+96(%rbp),%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-
-L$seal_sse_tail_64_rounds_and_x2hash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_sse_tail_64_rounds_and_x1hash:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_sse_tail_64_rounds_and_x2hash
-	decq	%r8
-	jge	L$seal_sse_tail_64_rounds_and_x1hash
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-
-	jmp	L$seal_sse_128_tail_xor
-
-L$seal_sse_tail_128:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	0+96(%rbp),%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-
-L$seal_sse_tail_128_rounds_and_x2hash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_sse_tail_128_rounds_and_x1hash:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-
-	leaq	16(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_sse_tail_128_rounds_and_x2hash
-	decq	%r8
-	jge	L$seal_sse_tail_128_rounds_and_x1hash
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqu	0 + 0(%rsi),%xmm3
-	movdqu	16 + 0(%rsi),%xmm7
-	movdqu	32 + 0(%rsi),%xmm11
-	movdqu	48 + 0(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 0(%rdi)
-	movdqu	%xmm5,16 + 0(%rdi)
-	movdqu	%xmm9,32 + 0(%rdi)
-	movdqu	%xmm15,48 + 0(%rdi)
-
-	movq	$64,%rcx
-	subq	$64,%rbx
-	leaq	64(%rsi),%rsi
-	jmp	L$seal_sse_128_tail_hash
-
-L$seal_sse_tail_192:
-	movdqa	L$chacha20_consts(%rip),%xmm0
-	movdqa	0+48(%rbp),%xmm4
-	movdqa	0+64(%rbp),%xmm8
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm0,%xmm2
-	movdqa	%xmm4,%xmm6
-	movdqa	%xmm8,%xmm10
-	movdqa	0+96(%rbp),%xmm14
-	paddd	L$sse_inc(%rip),%xmm14
-	movdqa	%xmm14,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm13,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,0+96(%rbp)
-	movdqa	%xmm13,0+112(%rbp)
-	movdqa	%xmm14,0+128(%rbp)
-
-L$seal_sse_tail_192_rounds_and_x2hash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_sse_tail_192_rounds_and_x1hash:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-
-	leaq	16(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_sse_tail_192_rounds_and_x2hash
-	decq	%r8
-	jge	L$seal_sse_tail_192_rounds_and_x1hash
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	0+48(%rbp),%xmm6
-	paddd	0+64(%rbp),%xmm10
-	paddd	0+128(%rbp),%xmm14
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	0+48(%rbp),%xmm5
-	paddd	0+64(%rbp),%xmm9
-	paddd	0+112(%rbp),%xmm13
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	0+48(%rbp),%xmm4
-	paddd	0+64(%rbp),%xmm8
-	paddd	0+96(%rbp),%xmm12
-	movdqu	0 + 0(%rsi),%xmm3
-	movdqu	16 + 0(%rsi),%xmm7
-	movdqu	32 + 0(%rsi),%xmm11
-	movdqu	48 + 0(%rsi),%xmm15
-	pxor	%xmm3,%xmm2
-	pxor	%xmm7,%xmm6
-	pxor	%xmm11,%xmm10
-	pxor	%xmm14,%xmm15
-	movdqu	%xmm2,0 + 0(%rdi)
-	movdqu	%xmm6,16 + 0(%rdi)
-	movdqu	%xmm10,32 + 0(%rdi)
-	movdqu	%xmm15,48 + 0(%rdi)
-	movdqu	0 + 64(%rsi),%xmm3
-	movdqu	16 + 64(%rsi),%xmm7
-	movdqu	32 + 64(%rsi),%xmm11
-	movdqu	48 + 64(%rsi),%xmm15
-	pxor	%xmm3,%xmm1
-	pxor	%xmm7,%xmm5
-	pxor	%xmm11,%xmm9
-	pxor	%xmm13,%xmm15
-	movdqu	%xmm1,0 + 64(%rdi)
-	movdqu	%xmm5,16 + 64(%rdi)
-	movdqu	%xmm9,32 + 64(%rdi)
-	movdqu	%xmm15,48 + 64(%rdi)
-
-	movq	$128,%rcx
-	subq	$128,%rbx
-	leaq	128(%rsi),%rsi
-
-L$seal_sse_128_tail_hash:
-	cmpq	$16,%rcx
-	jb	L$seal_sse_128_tail_xor
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	subq	$16,%rcx
-	leaq	16(%rdi),%rdi
-	jmp	L$seal_sse_128_tail_hash
-
-L$seal_sse_128_tail_xor:
-	cmpq	$16,%rbx
-	jb	L$seal_sse_tail_16
-	subq	$16,%rbx
-
-	movdqu	0(%rsi),%xmm3
-	pxor	%xmm3,%xmm0
-	movdqu	%xmm0,0(%rdi)
-
-	addq	0(%rdi),%r10
-	adcq	8(%rdi),%r11
-	adcq	$1,%r12
-	leaq	16(%rsi),%rsi
-	leaq	16(%rdi),%rdi
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm8,%xmm4
-	movdqa	%xmm12,%xmm8
-	movdqa	%xmm1,%xmm12
-	movdqa	%xmm5,%xmm1
-	movdqa	%xmm9,%xmm5
-	movdqa	%xmm13,%xmm9
-	jmp	L$seal_sse_128_tail_xor
-
-L$seal_sse_tail_16:
-	testq	%rbx,%rbx
-	jz	L$process_blocks_of_extra_in
-
-	movq	%rbx,%r8
-	movq	%rbx,%rcx
-	leaq	-1(%rsi,%rbx,1),%rsi
-	pxor	%xmm15,%xmm15
-L$seal_sse_tail_16_compose:
-	pslldq	$1,%xmm15
-	pinsrb	$0,(%rsi),%xmm15
-	leaq	-1(%rsi),%rsi
-	decq	%rcx
-	jne	L$seal_sse_tail_16_compose
-
-
-	pxor	%xmm0,%xmm15
-
-
-	movq	%rbx,%rcx
-	movdqu	%xmm15,%xmm0
-L$seal_sse_tail_16_extract:
-	pextrb	$0,%xmm0,(%rdi)
-	psrldq	$1,%xmm0
-	addq	$1,%rdi
-	subq	$1,%rcx
-	jnz	L$seal_sse_tail_16_extract
-
-
-
-
-
-
-
-
-	movq	288 + 0 + 32(%rsp),%r9
-	movq	56(%r9),%r14
-	movq	48(%r9),%r13
-	testq	%r14,%r14
-	jz	L$process_partial_block
-
-	movq	$16,%r15
-	subq	%rbx,%r15
-	cmpq	%r15,%r14
-
-	jge	L$load_extra_in
-	movq	%r14,%r15
-
-L$load_extra_in:
-
-
-	leaq	-1(%r13,%r15,1),%rsi
-
-
-	addq	%r15,%r13
-	subq	%r15,%r14
-	movq	%r13,48(%r9)
-	movq	%r14,56(%r9)
-
-
-
-	addq	%r15,%r8
-
-
-	pxor	%xmm11,%xmm11
-L$load_extra_load_loop:
-	pslldq	$1,%xmm11
-	pinsrb	$0,(%rsi),%xmm11
-	leaq	-1(%rsi),%rsi
-	subq	$1,%r15
-	jnz	L$load_extra_load_loop
-
-
-
-
-	movq	%rbx,%r15
-
-L$load_extra_shift_loop:
-	pslldq	$1,%xmm11
-	subq	$1,%r15
-	jnz	L$load_extra_shift_loop
-
-
-
-
-	leaq	L$and_masks(%rip),%r15
-	shlq	$4,%rbx
-	pand	-16(%r15,%rbx,1),%xmm15
-
-
-	por	%xmm11,%xmm15
-
-
-
-.byte	102,77,15,126,253
-	pextrq	$1,%xmm15,%r14
-	addq	%r13,%r10
-	adcq	%r14,%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-L$process_blocks_of_extra_in:
-
-	movq	288+32+0 (%rsp),%r9
-	movq	48(%r9),%rsi
-	movq	56(%r9),%r8
-	movq	%r8,%rcx
-	shrq	$4,%r8
-
-L$process_extra_hash_loop:
-	jz	process_extra_in_trailer
-	addq	0+0(%rsi),%r10
-	adcq	8+0(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rsi),%rsi
-	subq	$1,%r8
-	jmp	L$process_extra_hash_loop
-process_extra_in_trailer:
-	andq	$15,%rcx
-	movq	%rcx,%rbx
-	jz	L$do_length_block
-	leaq	-1(%rsi,%rcx,1),%rsi
-
-L$process_extra_in_trailer_load:
-	pslldq	$1,%xmm15
-	pinsrb	$0,(%rsi),%xmm15
-	leaq	-1(%rsi),%rsi
-	subq	$1,%rcx
-	jnz	L$process_extra_in_trailer_load
-
-L$process_partial_block:
-
-	leaq	L$and_masks(%rip),%r15
-	shlq	$4,%rbx
-	pand	-16(%r15,%rbx,1),%xmm15
-.byte	102,77,15,126,253
-	pextrq	$1,%xmm15,%r14
-	addq	%r13,%r10
-	adcq	%r14,%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-L$do_length_block:
-	addq	0+0+32(%rbp),%r10
-	adcq	8+0+32(%rbp),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-	movq	%r10,%r13
-	movq	%r11,%r14
-	movq	%r12,%r15
-	subq	$-5,%r10
-	sbbq	$-1,%r11
-	sbbq	$3,%r12
-	cmovcq	%r13,%r10
-	cmovcq	%r14,%r11
-	cmovcq	%r15,%r12
-
-	addq	0+0+16(%rbp),%r10
-	adcq	8+0+16(%rbp),%r11
-
-
-	addq	$288 + 0 + 32,%rsp
-
-
-	popq	%r9
-
-	movq	%r10,(%r9)
-	movq	%r11,8(%r9)
-	popq	%r15
-
-	popq	%r14
-
-	popq	%r13
-
-	popq	%r12
-
-	popq	%rbx
-
-	popq	%rbp
-
-	ret
-
-L$seal_sse_128:
-
-	movdqu	L$chacha20_consts(%rip),%xmm0
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm0,%xmm2
-	movdqu	0(%r9),%xmm4
-	movdqa	%xmm4,%xmm5
-	movdqa	%xmm4,%xmm6
-	movdqu	16(%r9),%xmm8
-	movdqa	%xmm8,%xmm9
-	movdqa	%xmm8,%xmm10
-	movdqu	32(%r9),%xmm14
-	movdqa	%xmm14,%xmm12
-	paddd	L$sse_inc(%rip),%xmm12
-	movdqa	%xmm12,%xmm13
-	paddd	L$sse_inc(%rip),%xmm13
-	movdqa	%xmm4,%xmm7
-	movdqa	%xmm8,%xmm11
-	movdqa	%xmm12,%xmm15
-	movq	$10,%r10
-
-L$seal_sse_128_rounds:
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,4
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,12
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,4
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,12
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,4
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,12
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol16(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm4
-	pxor	%xmm3,%xmm4
-	paddd	%xmm4,%xmm0
-	pxor	%xmm0,%xmm12
-	pshufb	L$rol8(%rip),%xmm12
-	paddd	%xmm12,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,15,228,12
-.byte	102,69,15,58,15,192,8
-.byte	102,69,15,58,15,228,4
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol16(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm5
-	pxor	%xmm3,%xmm5
-	paddd	%xmm5,%xmm1
-	pxor	%xmm1,%xmm13
-	pshufb	L$rol8(%rip),%xmm13
-	paddd	%xmm13,%xmm9
-	pxor	%xmm9,%xmm5
-	movdqa	%xmm5,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm5
-	pxor	%xmm3,%xmm5
-.byte	102,15,58,15,237,12
-.byte	102,69,15,58,15,201,8
-.byte	102,69,15,58,15,237,4
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol16(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$12,%xmm3
-	psrld	$20,%xmm6
-	pxor	%xmm3,%xmm6
-	paddd	%xmm6,%xmm2
-	pxor	%xmm2,%xmm14
-	pshufb	L$rol8(%rip),%xmm14
-	paddd	%xmm14,%xmm10
-	pxor	%xmm10,%xmm6
-	movdqa	%xmm6,%xmm3
-	pslld	$7,%xmm3
-	psrld	$25,%xmm6
-	pxor	%xmm3,%xmm6
-.byte	102,15,58,15,246,12
-.byte	102,69,15,58,15,210,8
-.byte	102,69,15,58,15,246,4
-
-	decq	%r10
-	jnz	L$seal_sse_128_rounds
-	paddd	L$chacha20_consts(%rip),%xmm0
-	paddd	L$chacha20_consts(%rip),%xmm1
-	paddd	L$chacha20_consts(%rip),%xmm2
-	paddd	%xmm7,%xmm4
-	paddd	%xmm7,%xmm5
-	paddd	%xmm7,%xmm6
-	paddd	%xmm11,%xmm8
-	paddd	%xmm11,%xmm9
-	paddd	%xmm15,%xmm12
-	paddd	L$sse_inc(%rip),%xmm15
-	paddd	%xmm15,%xmm13
-
-	pand	L$clamp(%rip),%xmm2
-	movdqa	%xmm2,0+0(%rbp)
-	movdqa	%xmm6,0+16(%rbp)
-
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-	jmp	L$seal_sse_128_tail_xor
-
-
-
-
-
-.p2align	6
-chacha20_poly1305_open_avx2:
-
-
-
-
-
-
-
-
-
-
-
-
-	vzeroupper
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vbroadcasti128	0(%r9),%ymm4
-	vbroadcasti128	16(%r9),%ymm8
-	vbroadcasti128	32(%r9),%ymm12
-	vpaddd	L$avx2_init(%rip),%ymm12,%ymm12
-	cmpq	$192,%rbx
-	jbe	L$open_avx2_192
-	cmpq	$320,%rbx
-	jbe	L$open_avx2_320
-
-	vmovdqa	%ymm4,0+64(%rbp)
-	vmovdqa	%ymm8,0+96(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-	movq	$10,%r10
-L$open_avx2_init_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-
-	decq	%r10
-	jne	L$open_avx2_init_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-
-	vpand	L$clamp(%rip),%ymm3,%ymm3
-	vmovdqa	%ymm3,0+0(%rbp)
-
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
-
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-
-	xorq	%rcx,%rcx
-L$open_avx2_init_hash:
-	addq	0+0(%rsi,%rcx,1),%r10
-	adcq	8+0(%rsi,%rcx,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	addq	$16,%rcx
-	cmpq	$64,%rcx
-	jne	L$open_avx2_init_hash
-
-	vpxor	0(%rsi),%ymm0,%ymm0
-	vpxor	32(%rsi),%ymm4,%ymm4
-
-	vmovdqu	%ymm0,0(%rdi)
-	vmovdqu	%ymm4,32(%rdi)
-	leaq	64(%rsi),%rsi
-	leaq	64(%rdi),%rdi
-	subq	$64,%rbx
-L$open_avx2_main_loop:
-
-	cmpq	$512,%rbx
-	jb	L$open_avx2_main_loop_done
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm15
-	vpaddd	%ymm15,%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm15,0+256(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-
-	xorq	%rcx,%rcx
-L$open_avx2_main_loop_rounds:
-	addq	0+0(%rsi,%rcx,1),%r10
-	adcq	8+0(%rsi,%rcx,1),%r11
-	adcq	$1,%r12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	addq	0+16(%rsi,%rcx,1),%r10
-	adcq	8+16(%rsi,%rcx,1),%r11
-	adcq	$1,%r12
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	addq	0+32(%rsi,%rcx,1),%r10
-	adcq	8+32(%rsi,%rcx,1),%r11
-	adcq	$1,%r12
-
-	leaq	48(%rcx),%rcx
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-
-	cmpq	$60*8,%rcx
-	jne	L$open_avx2_main_loop_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
-	vpaddd	0+64(%rbp),%ymm7,%ymm7
-	vpaddd	0+96(%rbp),%ymm11,%ymm11
-	vpaddd	0+256(%rbp),%ymm15,%ymm15
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vmovdqa	%ymm0,0+128(%rbp)
-	addq	0+60*8(%rsi),%r10
-	adcq	8+60*8(%rsi),%r11
-	adcq	$1,%r12
-	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
-	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
-	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
-	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
-	vpxor	0+0(%rsi),%ymm0,%ymm0
-	vpxor	32+0(%rsi),%ymm3,%ymm3
-	vpxor	64+0(%rsi),%ymm7,%ymm7
-	vpxor	96+0(%rsi),%ymm11,%ymm11
-	vmovdqu	%ymm0,0+0(%rdi)
-	vmovdqu	%ymm3,32+0(%rdi)
-	vmovdqu	%ymm7,64+0(%rdi)
-	vmovdqu	%ymm11,96+0(%rdi)
-
-	vmovdqa	0+128(%rbp),%ymm0
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm2,%ymm2
-	vpxor	64+128(%rsi),%ymm6,%ymm6
-	vpxor	96+128(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm2,32+128(%rdi)
-	vmovdqu	%ymm6,64+128(%rdi)
-	vmovdqu	%ymm10,96+128(%rdi)
-	addq	0+60*8+16(%rsi),%r10
-	adcq	8+60*8+16(%rsi),%r11
-	adcq	$1,%r12
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+256(%rsi),%ymm3,%ymm3
-	vpxor	32+256(%rsi),%ymm1,%ymm1
-	vpxor	64+256(%rsi),%ymm5,%ymm5
-	vpxor	96+256(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+256(%rdi)
-	vmovdqu	%ymm1,32+256(%rdi)
-	vmovdqu	%ymm5,64+256(%rdi)
-	vmovdqu	%ymm9,96+256(%rdi)
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
-	vpxor	0+384(%rsi),%ymm3,%ymm3
-	vpxor	32+384(%rsi),%ymm0,%ymm0
-	vpxor	64+384(%rsi),%ymm4,%ymm4
-	vpxor	96+384(%rsi),%ymm8,%ymm8
-	vmovdqu	%ymm3,0+384(%rdi)
-	vmovdqu	%ymm0,32+384(%rdi)
-	vmovdqu	%ymm4,64+384(%rdi)
-	vmovdqu	%ymm8,96+384(%rdi)
-
-	leaq	512(%rsi),%rsi
-	leaq	512(%rdi),%rdi
-	subq	$512,%rbx
-	jmp	L$open_avx2_main_loop
-L$open_avx2_main_loop_done:
-	testq	%rbx,%rbx
-	vzeroupper
-	je	L$open_sse_finalize
-
-	cmpq	$384,%rbx
-	ja	L$open_avx2_tail_512
-	cmpq	$256,%rbx
-	ja	L$open_avx2_tail_384
-	cmpq	$128,%rbx
-	ja	L$open_avx2_tail_256
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-
-	xorq	%r8,%r8
-	movq	%rbx,%rcx
-	andq	$-16,%rcx
-	testq	%rcx,%rcx
-	je	L$open_avx2_tail_128_rounds
-L$open_avx2_tail_128_rounds_and_x1hash:
-	addq	0+0(%rsi,%r8,1),%r10
-	adcq	8+0(%rsi,%r8,1),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-L$open_avx2_tail_128_rounds:
-	addq	$16,%r8
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-
-	cmpq	%rcx,%r8
-	jb	L$open_avx2_tail_128_rounds_and_x1hash
-	cmpq	$160,%r8
-	jne	L$open_avx2_tail_128_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	jmp	L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_256:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-
-	movq	%rbx,0+128(%rbp)
-	movq	%rbx,%rcx
-	subq	$128,%rcx
-	shrq	$4,%rcx
-	movq	$10,%r8
-	cmpq	$10,%rcx
-	cmovgq	%r8,%rcx
-	movq	%rsi,%rbx
-	xorq	%r8,%r8
-L$open_avx2_tail_256_rounds_and_x1hash:
-	addq	0+0(%rbx),%r10
-	adcq	8+0(%rbx),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rbx),%rbx
-L$open_avx2_tail_256_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-
-	incq	%r8
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-
-	cmpq	%rcx,%r8
-	jb	L$open_avx2_tail_256_rounds_and_x1hash
-	cmpq	$10,%r8
-	jne	L$open_avx2_tail_256_rounds
-	movq	%rbx,%r8
-	subq	%rsi,%rbx
-	movq	%rbx,%rcx
-	movq	0+128(%rbp),%rbx
-L$open_avx2_tail_256_hash:
-	addq	$16,%rcx
-	cmpq	%rbx,%rcx
-	jg	L$open_avx2_tail_256_done
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%r8),%r8
-	jmp	L$open_avx2_tail_256_hash
-L$open_avx2_tail_256_done:
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+0(%rsi),%ymm3,%ymm3
-	vpxor	32+0(%rsi),%ymm1,%ymm1
-	vpxor	64+0(%rsi),%ymm5,%ymm5
-	vpxor	96+0(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+0(%rdi)
-	vmovdqu	%ymm1,32+0(%rdi)
-	vmovdqu	%ymm5,64+0(%rdi)
-	vmovdqu	%ymm9,96+0(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	leaq	128(%rsi),%rsi
-	leaq	128(%rdi),%rdi
-	subq	$128,%rbx
-	jmp	L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_384:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-
-	movq	%rbx,0+128(%rbp)
-	movq	%rbx,%rcx
-	subq	$256,%rcx
-	shrq	$4,%rcx
-	addq	$6,%rcx
-	movq	$10,%r8
-	cmpq	$10,%rcx
-	cmovgq	%r8,%rcx
-	movq	%rsi,%rbx
-	xorq	%r8,%r8
-L$open_avx2_tail_384_rounds_and_x2hash:
-	addq	0+0(%rbx),%r10
-	adcq	8+0(%rbx),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rbx),%rbx
-L$open_avx2_tail_384_rounds_and_x1hash:
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	addq	0+0(%rbx),%r10
-	adcq	8+0(%rbx),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rbx),%rbx
-	incq	%r8
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-
-	cmpq	%rcx,%r8
-	jb	L$open_avx2_tail_384_rounds_and_x2hash
-	cmpq	$10,%r8
-	jne	L$open_avx2_tail_384_rounds_and_x1hash
-	movq	%rbx,%r8
-	subq	%rsi,%rbx
-	movq	%rbx,%rcx
-	movq	0+128(%rbp),%rbx
-L$open_avx2_384_tail_hash:
-	addq	$16,%rcx
-	cmpq	%rbx,%rcx
-	jg	L$open_avx2_384_tail_done
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%r8),%r8
-	jmp	L$open_avx2_384_tail_hash
-L$open_avx2_384_tail_done:
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+0(%rsi),%ymm3,%ymm3
-	vpxor	32+0(%rsi),%ymm2,%ymm2
-	vpxor	64+0(%rsi),%ymm6,%ymm6
-	vpxor	96+0(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+0(%rdi)
-	vmovdqu	%ymm2,32+0(%rdi)
-	vmovdqu	%ymm6,64+0(%rdi)
-	vmovdqu	%ymm10,96+0(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm1,%ymm1
-	vpxor	64+128(%rsi),%ymm5,%ymm5
-	vpxor	96+128(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm1,32+128(%rdi)
-	vmovdqu	%ymm5,64+128(%rdi)
-	vmovdqu	%ymm9,96+128(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	leaq	256(%rsi),%rsi
-	leaq	256(%rdi),%rdi
-	subq	$256,%rbx
-	jmp	L$open_avx2_tail_128_xor
-
-L$open_avx2_tail_512:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm15
-	vpaddd	%ymm15,%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm15,0+256(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-
-	xorq	%rcx,%rcx
-	movq	%rsi,%r8
-L$open_avx2_tail_512_rounds_and_x2hash:
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%r8),%r8
-L$open_avx2_tail_512_rounds_and_x1hash:
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	addq	0+16(%r8),%r10
-	adcq	8+16(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%r8),%r8
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-
-	incq	%rcx
-	cmpq	$4,%rcx
-	jl	L$open_avx2_tail_512_rounds_and_x2hash
-	cmpq	$10,%rcx
-	jne	L$open_avx2_tail_512_rounds_and_x1hash
-	movq	%rbx,%rcx
-	subq	$384,%rcx
-	andq	$-16,%rcx
-L$open_avx2_tail_512_hash:
-	testq	%rcx,%rcx
-	je	L$open_avx2_tail_512_done
-	addq	0+0(%r8),%r10
-	adcq	8+0(%r8),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%r8),%r8
-	subq	$16,%rcx
-	jmp	L$open_avx2_tail_512_hash
-L$open_avx2_tail_512_done:
-	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
-	vpaddd	0+64(%rbp),%ymm7,%ymm7
-	vpaddd	0+96(%rbp),%ymm11,%ymm11
-	vpaddd	0+256(%rbp),%ymm15,%ymm15
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vmovdqa	%ymm0,0+128(%rbp)
-	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
-	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
-	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
-	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
-	vpxor	0+0(%rsi),%ymm0,%ymm0
-	vpxor	32+0(%rsi),%ymm3,%ymm3
-	vpxor	64+0(%rsi),%ymm7,%ymm7
-	vpxor	96+0(%rsi),%ymm11,%ymm11
-	vmovdqu	%ymm0,0+0(%rdi)
-	vmovdqu	%ymm3,32+0(%rdi)
-	vmovdqu	%ymm7,64+0(%rdi)
-	vmovdqu	%ymm11,96+0(%rdi)
-
-	vmovdqa	0+128(%rbp),%ymm0
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm2,%ymm2
-	vpxor	64+128(%rsi),%ymm6,%ymm6
-	vpxor	96+128(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm2,32+128(%rdi)
-	vmovdqu	%ymm6,64+128(%rdi)
-	vmovdqu	%ymm10,96+128(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+256(%rsi),%ymm3,%ymm3
-	vpxor	32+256(%rsi),%ymm1,%ymm1
-	vpxor	64+256(%rsi),%ymm5,%ymm5
-	vpxor	96+256(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+256(%rdi)
-	vmovdqu	%ymm1,32+256(%rdi)
-	vmovdqu	%ymm5,64+256(%rdi)
-	vmovdqu	%ymm9,96+256(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	leaq	384(%rsi),%rsi
-	leaq	384(%rdi),%rdi
-	subq	$384,%rbx
-L$open_avx2_tail_128_xor:
-	cmpq	$32,%rbx
-	jb	L$open_avx2_tail_32_xor
-	subq	$32,%rbx
-	vpxor	(%rsi),%ymm0,%ymm0
-	vmovdqu	%ymm0,(%rdi)
-	leaq	32(%rsi),%rsi
-	leaq	32(%rdi),%rdi
-	vmovdqa	%ymm4,%ymm0
-	vmovdqa	%ymm8,%ymm4
-	vmovdqa	%ymm12,%ymm8
-	jmp	L$open_avx2_tail_128_xor
-L$open_avx2_tail_32_xor:
-	cmpq	$16,%rbx
-	vmovdqa	%xmm0,%xmm1
-	jb	L$open_avx2_exit
-	subq	$16,%rbx
-
-	vpxor	(%rsi),%xmm0,%xmm1
-	vmovdqu	%xmm1,(%rdi)
-	leaq	16(%rsi),%rsi
-	leaq	16(%rdi),%rdi
-	vperm2i128	$0x11,%ymm0,%ymm0,%ymm0
-	vmovdqa	%xmm0,%xmm1
-L$open_avx2_exit:
-	vzeroupper
-	jmp	L$open_sse_tail_16
-
-L$open_avx2_192:
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm8,%ymm10
-	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
-	vmovdqa	%ymm12,%ymm11
-	vmovdqa	%ymm13,%ymm15
-	movq	$10,%r10
-L$open_avx2_192_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-
-	decq	%r10
-	jne	L$open_avx2_192_rounds
-	vpaddd	%ymm2,%ymm0,%ymm0
-	vpaddd	%ymm2,%ymm1,%ymm1
-	vpaddd	%ymm6,%ymm4,%ymm4
-	vpaddd	%ymm6,%ymm5,%ymm5
-	vpaddd	%ymm10,%ymm8,%ymm8
-	vpaddd	%ymm10,%ymm9,%ymm9
-	vpaddd	%ymm11,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm13,%ymm13
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-
-	vpand	L$clamp(%rip),%ymm3,%ymm3
-	vmovdqa	%ymm3,0+0(%rbp)
-
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
-L$open_avx2_short:
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-L$open_avx2_short_hash_and_xor_loop:
-	cmpq	$32,%rbx
-	jb	L$open_avx2_short_tail_32
-	subq	$32,%rbx
-	addq	0+0(%rsi),%r10
-	adcq	8+0(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	addq	0+16(%rsi),%r10
-	adcq	8+16(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-
-	vpxor	(%rsi),%ymm0,%ymm0
-	vmovdqu	%ymm0,(%rdi)
-	leaq	32(%rsi),%rsi
-	leaq	32(%rdi),%rdi
-
-	vmovdqa	%ymm4,%ymm0
-	vmovdqa	%ymm8,%ymm4
-	vmovdqa	%ymm12,%ymm8
-	vmovdqa	%ymm1,%ymm12
-	vmovdqa	%ymm5,%ymm1
-	vmovdqa	%ymm9,%ymm5
-	vmovdqa	%ymm13,%ymm9
-	vmovdqa	%ymm2,%ymm13
-	vmovdqa	%ymm6,%ymm2
-	jmp	L$open_avx2_short_hash_and_xor_loop
-L$open_avx2_short_tail_32:
-	cmpq	$16,%rbx
-	vmovdqa	%xmm0,%xmm1
-	jb	L$open_avx2_short_tail_32_exit
-	subq	$16,%rbx
-	addq	0+0(%rsi),%r10
-	adcq	8+0(%rsi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	vpxor	(%rsi),%xmm0,%xmm3
-	vmovdqu	%xmm3,(%rdi)
-	leaq	16(%rsi),%rsi
-	leaq	16(%rdi),%rdi
-	vextracti128	$1,%ymm0,%xmm1
-L$open_avx2_short_tail_32_exit:
-	vzeroupper
-	jmp	L$open_sse_tail_16
-
-L$open_avx2_320:
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm8,%ymm10
-	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
-	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm14
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	movq	$10,%r10
-L$open_avx2_320_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-
-	decq	%r10
-	jne	L$open_avx2_320_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	%ymm7,%ymm4,%ymm4
-	vpaddd	%ymm7,%ymm5,%ymm5
-	vpaddd	%ymm7,%ymm6,%ymm6
-	vpaddd	%ymm11,%ymm8,%ymm8
-	vpaddd	%ymm11,%ymm9,%ymm9
-	vpaddd	%ymm11,%ymm10,%ymm10
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-
-	vpand	L$clamp(%rip),%ymm3,%ymm3
-	vmovdqa	%ymm3,0+0(%rbp)
-
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
-	jmp	L$open_avx2_short
-
-
-
-
-
-.p2align	6
-chacha20_poly1305_seal_avx2:
-
-
-
-
-
-
-
-
-
-
-
-
-	vzeroupper
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vbroadcasti128	0(%r9),%ymm4
-	vbroadcasti128	16(%r9),%ymm8
-	vbroadcasti128	32(%r9),%ymm12
-	vpaddd	L$avx2_init(%rip),%ymm12,%ymm12
-	cmpq	$192,%rbx
-	jbe	L$seal_avx2_192
-	cmpq	$320,%rbx
-	jbe	L$seal_avx2_320
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm4,0+64(%rbp)
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	%ymm8,0+96(%rbp)
-	vmovdqa	%ymm12,%ymm15
-	vpaddd	L$avx2_inc(%rip),%ymm15,%ymm14
-	vpaddd	L$avx2_inc(%rip),%ymm14,%ymm13
-	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm15,0+256(%rbp)
-	movq	$10,%r10
-L$seal_avx2_init_rounds:
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-
-	decq	%r10
-	jnz	L$seal_avx2_init_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
-	vpaddd	0+64(%rbp),%ymm7,%ymm7
-	vpaddd	0+96(%rbp),%ymm11,%ymm11
-	vpaddd	0+256(%rbp),%ymm15,%ymm15
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
-	vperm2i128	$0x02,%ymm3,%ymm7,%ymm15
-	vperm2i128	$0x13,%ymm3,%ymm7,%ymm3
-	vpand	L$clamp(%rip),%ymm15,%ymm15
-	vmovdqa	%ymm15,0+0(%rbp)
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-
-	vpxor	0(%rsi),%ymm3,%ymm3
-	vpxor	32(%rsi),%ymm11,%ymm11
-	vmovdqu	%ymm3,0(%rdi)
-	vmovdqu	%ymm11,32(%rdi)
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm15
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+64(%rsi),%ymm15,%ymm15
-	vpxor	32+64(%rsi),%ymm2,%ymm2
-	vpxor	64+64(%rsi),%ymm6,%ymm6
-	vpxor	96+64(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm15,0+64(%rdi)
-	vmovdqu	%ymm2,32+64(%rdi)
-	vmovdqu	%ymm6,64+64(%rdi)
-	vmovdqu	%ymm10,96+64(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm15
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+192(%rsi),%ymm15,%ymm15
-	vpxor	32+192(%rsi),%ymm1,%ymm1
-	vpxor	64+192(%rsi),%ymm5,%ymm5
-	vpxor	96+192(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm15,0+192(%rdi)
-	vmovdqu	%ymm1,32+192(%rdi)
-	vmovdqu	%ymm5,64+192(%rdi)
-	vmovdqu	%ymm9,96+192(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm15
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm15,%ymm8
-
-	leaq	320(%rsi),%rsi
-	subq	$320,%rbx
-	movq	$320,%rcx
-	cmpq	$128,%rbx
-	jbe	L$seal_avx2_short_hash_remainder
-	vpxor	0(%rsi),%ymm0,%ymm0
-	vpxor	32(%rsi),%ymm4,%ymm4
-	vpxor	64(%rsi),%ymm8,%ymm8
-	vpxor	96(%rsi),%ymm12,%ymm12
-	vmovdqu	%ymm0,320(%rdi)
-	vmovdqu	%ymm4,352(%rdi)
-	vmovdqu	%ymm8,384(%rdi)
-	vmovdqu	%ymm12,416(%rdi)
-	leaq	128(%rsi),%rsi
-	subq	$128,%rbx
-	movq	$8,%rcx
-	movq	$2,%r8
-	cmpq	$128,%rbx
-	jbe	L$seal_avx2_tail_128
-	cmpq	$256,%rbx
-	jbe	L$seal_avx2_tail_256
-	cmpq	$384,%rbx
-	jbe	L$seal_avx2_tail_384
-	cmpq	$512,%rbx
-	jbe	L$seal_avx2_tail_512
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm15
-	vpaddd	%ymm15,%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm15,0+256(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-
-	subq	$16,%rdi
-	movq	$9,%rcx
-	jmp	L$seal_avx2_main_loop_rounds_entry
-.p2align	5
-L$seal_avx2_main_loop:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm15
-	vpaddd	%ymm15,%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm15,0+256(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-
-	movq	$10,%rcx
-.p2align	5
-L$seal_avx2_main_loop_rounds:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-L$seal_avx2_main_loop_rounds_entry:
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	addq	0+32(%rdi),%r10
-	adcq	8+32(%rdi),%r11
-	adcq	$1,%r12
-
-	leaq	48(%rdi),%rdi
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-
-	decq	%rcx
-	jne	L$seal_avx2_main_loop_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
-	vpaddd	0+64(%rbp),%ymm7,%ymm7
-	vpaddd	0+96(%rbp),%ymm11,%ymm11
-	vpaddd	0+256(%rbp),%ymm15,%ymm15
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vmovdqa	%ymm0,0+128(%rbp)
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
-	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
-	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
-	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
-	vpxor	0+0(%rsi),%ymm0,%ymm0
-	vpxor	32+0(%rsi),%ymm3,%ymm3
-	vpxor	64+0(%rsi),%ymm7,%ymm7
-	vpxor	96+0(%rsi),%ymm11,%ymm11
-	vmovdqu	%ymm0,0+0(%rdi)
-	vmovdqu	%ymm3,32+0(%rdi)
-	vmovdqu	%ymm7,64+0(%rdi)
-	vmovdqu	%ymm11,96+0(%rdi)
-
-	vmovdqa	0+128(%rbp),%ymm0
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm2,%ymm2
-	vpxor	64+128(%rsi),%ymm6,%ymm6
-	vpxor	96+128(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm2,32+128(%rdi)
-	vmovdqu	%ymm6,64+128(%rdi)
-	vmovdqu	%ymm10,96+128(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+256(%rsi),%ymm3,%ymm3
-	vpxor	32+256(%rsi),%ymm1,%ymm1
-	vpxor	64+256(%rsi),%ymm5,%ymm5
-	vpxor	96+256(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+256(%rdi)
-	vmovdqu	%ymm1,32+256(%rdi)
-	vmovdqu	%ymm5,64+256(%rdi)
-	vmovdqu	%ymm9,96+256(%rdi)
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
-	vpxor	0+384(%rsi),%ymm3,%ymm3
-	vpxor	32+384(%rsi),%ymm0,%ymm0
-	vpxor	64+384(%rsi),%ymm4,%ymm4
-	vpxor	96+384(%rsi),%ymm8,%ymm8
-	vmovdqu	%ymm3,0+384(%rdi)
-	vmovdqu	%ymm0,32+384(%rdi)
-	vmovdqu	%ymm4,64+384(%rdi)
-	vmovdqu	%ymm8,96+384(%rdi)
-
-	leaq	512(%rsi),%rsi
-	subq	$512,%rbx
-	cmpq	$512,%rbx
-	jg	L$seal_avx2_main_loop
-
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-	movq	$10,%rcx
-	xorq	%r8,%r8
-
-	cmpq	$384,%rbx
-	ja	L$seal_avx2_tail_512
-	cmpq	$256,%rbx
-	ja	L$seal_avx2_tail_384
-	cmpq	$128,%rbx
-	ja	L$seal_avx2_tail_256
-
-L$seal_avx2_tail_128:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-
-L$seal_avx2_tail_128_rounds_and_3xhash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_avx2_tail_128_rounds_and_2xhash:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_avx2_tail_128_rounds_and_3xhash
-	decq	%r8
-	jge	L$seal_avx2_tail_128_rounds_and_2xhash
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	jmp	L$seal_avx2_short_loop
-
-L$seal_avx2_tail_256:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-
-L$seal_avx2_tail_256_rounds_and_3xhash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_avx2_tail_256_rounds_and_2xhash:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_avx2_tail_256_rounds_and_3xhash
-	decq	%r8
-	jge	L$seal_avx2_tail_256_rounds_and_2xhash
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+0(%rsi),%ymm3,%ymm3
-	vpxor	32+0(%rsi),%ymm1,%ymm1
-	vpxor	64+0(%rsi),%ymm5,%ymm5
-	vpxor	96+0(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+0(%rdi)
-	vmovdqu	%ymm1,32+0(%rdi)
-	vmovdqu	%ymm5,64+0(%rdi)
-	vmovdqu	%ymm9,96+0(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	movq	$128,%rcx
-	leaq	128(%rsi),%rsi
-	subq	$128,%rbx
-	jmp	L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_tail_384:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-
-L$seal_avx2_tail_384_rounds_and_3xhash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_avx2_tail_384_rounds_and_2xhash:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-
-	leaq	32(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_avx2_tail_384_rounds_and_3xhash
-	decq	%r8
-	jge	L$seal_avx2_tail_384_rounds_and_2xhash
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+0(%rsi),%ymm3,%ymm3
-	vpxor	32+0(%rsi),%ymm2,%ymm2
-	vpxor	64+0(%rsi),%ymm6,%ymm6
-	vpxor	96+0(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+0(%rdi)
-	vmovdqu	%ymm2,32+0(%rdi)
-	vmovdqu	%ymm6,64+0(%rdi)
-	vmovdqu	%ymm10,96+0(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm1,%ymm1
-	vpxor	64+128(%rsi),%ymm5,%ymm5
-	vpxor	96+128(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm1,32+128(%rdi)
-	vmovdqu	%ymm5,64+128(%rdi)
-	vmovdqu	%ymm9,96+128(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	movq	$256,%rcx
-	leaq	256(%rsi),%rsi
-	subq	$256,%rbx
-	jmp	L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_tail_512:
-	vmovdqa	L$chacha20_consts(%rip),%ymm0
-	vmovdqa	0+64(%rbp),%ymm4
-	vmovdqa	0+96(%rbp),%ymm8
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm10
-	vmovdqa	%ymm0,%ymm3
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	L$avx2_inc(%rip),%ymm12
-	vpaddd	0+160(%rbp),%ymm12,%ymm15
-	vpaddd	%ymm15,%ymm12,%ymm14
-	vpaddd	%ymm14,%ymm12,%ymm13
-	vpaddd	%ymm13,%ymm12,%ymm12
-	vmovdqa	%ymm15,0+256(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm12,0+160(%rbp)
-
-L$seal_avx2_tail_512_rounds_and_3xhash:
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-L$seal_avx2_tail_512_rounds_and_2xhash:
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$4,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$12,%ymm15,%ymm15,%ymm15
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vmovdqa	%ymm8,0+128(%rbp)
-	vmovdqa	L$rol16(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$20,%ymm7,%ymm8
-	vpslld	$32-20,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$20,%ymm6,%ymm8
-	vpslld	$32-20,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$20,%ymm5,%ymm8
-	vpslld	$32-20,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$20,%ymm4,%ymm8
-	vpslld	$32-20,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	L$rol8(%rip),%ymm8
-	vpaddd	%ymm7,%ymm3,%ymm3
-	vpaddd	%ymm6,%ymm2,%ymm2
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm3,%ymm15,%ymm15
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	%ymm8,%ymm15,%ymm15
-	vpshufb	%ymm8,%ymm14,%ymm14
-	vpshufb	%ymm8,%ymm13,%ymm13
-	vpshufb	%ymm8,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm11,%ymm11
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpaddd	0+128(%rbp),%ymm12,%ymm8
-	vpxor	%ymm11,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	%ymm8,0+128(%rbp)
-	vpsrld	$25,%ymm7,%ymm8
-	movq	0+0+0(%rbp),%rdx
-	movq	%rdx,%r15
-	mulxq	%r10,%r13,%r14
-	mulxq	%r11,%rax,%rdx
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	vpslld	$32-25,%ymm7,%ymm7
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$25,%ymm6,%ymm8
-	vpslld	$32-25,%ymm6,%ymm6
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$25,%ymm5,%ymm8
-	vpslld	$32-25,%ymm5,%ymm5
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$25,%ymm4,%ymm8
-	vpslld	$32-25,%ymm4,%ymm4
-	vpxor	%ymm8,%ymm4,%ymm4
-	vmovdqa	0+128(%rbp),%ymm8
-	vpalignr	$12,%ymm7,%ymm7,%ymm7
-	vpalignr	$8,%ymm11,%ymm11,%ymm11
-	vpalignr	$4,%ymm15,%ymm15,%ymm15
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	movq	8+0+0(%rbp),%rdx
-	mulxq	%r10,%r10,%rax
-	addq	%r10,%r14
-	mulxq	%r11,%r11,%r9
-	adcq	%r11,%r15
-	adcq	$0,%r9
-	imulq	%r12,%rdx
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-	addq	%rax,%r15
-	adcq	%rdx,%r9
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-	decq	%rcx
-	jg	L$seal_avx2_tail_512_rounds_and_3xhash
-	decq	%r8
-	jge	L$seal_avx2_tail_512_rounds_and_2xhash
-	vpaddd	L$chacha20_consts(%rip),%ymm3,%ymm3
-	vpaddd	0+64(%rbp),%ymm7,%ymm7
-	vpaddd	0+96(%rbp),%ymm11,%ymm11
-	vpaddd	0+256(%rbp),%ymm15,%ymm15
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	0+64(%rbp),%ymm6,%ymm6
-	vpaddd	0+96(%rbp),%ymm10,%ymm10
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	0+64(%rbp),%ymm5,%ymm5
-	vpaddd	0+96(%rbp),%ymm9,%ymm9
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	0+64(%rbp),%ymm4,%ymm4
-	vpaddd	0+96(%rbp),%ymm8,%ymm8
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-
-	vmovdqa	%ymm0,0+128(%rbp)
-	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
-	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
-	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
-	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
-	vpxor	0+0(%rsi),%ymm0,%ymm0
-	vpxor	32+0(%rsi),%ymm3,%ymm3
-	vpxor	64+0(%rsi),%ymm7,%ymm7
-	vpxor	96+0(%rsi),%ymm11,%ymm11
-	vmovdqu	%ymm0,0+0(%rdi)
-	vmovdqu	%ymm3,32+0(%rdi)
-	vmovdqu	%ymm7,64+0(%rdi)
-	vmovdqu	%ymm11,96+0(%rdi)
-
-	vmovdqa	0+128(%rbp),%ymm0
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
-	vpxor	0+128(%rsi),%ymm3,%ymm3
-	vpxor	32+128(%rsi),%ymm2,%ymm2
-	vpxor	64+128(%rsi),%ymm6,%ymm6
-	vpxor	96+128(%rsi),%ymm10,%ymm10
-	vmovdqu	%ymm3,0+128(%rdi)
-	vmovdqu	%ymm2,32+128(%rdi)
-	vmovdqu	%ymm6,64+128(%rdi)
-	vmovdqu	%ymm10,96+128(%rdi)
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
-	vpxor	0+256(%rsi),%ymm3,%ymm3
-	vpxor	32+256(%rsi),%ymm1,%ymm1
-	vpxor	64+256(%rsi),%ymm5,%ymm5
-	vpxor	96+256(%rsi),%ymm9,%ymm9
-	vmovdqu	%ymm3,0+256(%rdi)
-	vmovdqu	%ymm1,32+256(%rdi)
-	vmovdqu	%ymm5,64+256(%rdi)
-	vmovdqu	%ymm9,96+256(%rdi)
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
-	vmovdqa	%ymm3,%ymm8
-
-	movq	$384,%rcx
-	leaq	384(%rsi),%rsi
-	subq	$384,%rbx
-	jmp	L$seal_avx2_short_hash_remainder
-
-L$seal_avx2_320:
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm8,%ymm10
-	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
-	vpaddd	L$avx2_inc(%rip),%ymm13,%ymm14
-	vmovdqa	%ymm4,%ymm7
-	vmovdqa	%ymm8,%ymm11
-	vmovdqa	%ymm12,0+160(%rbp)
-	vmovdqa	%ymm13,0+192(%rbp)
-	vmovdqa	%ymm14,0+224(%rbp)
-	movq	$10,%r10
-L$seal_avx2_320_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$12,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$4,%ymm6,%ymm6,%ymm6
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol16(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpsrld	$20,%ymm6,%ymm3
-	vpslld	$12,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpaddd	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm2,%ymm14,%ymm14
-	vpshufb	L$rol8(%rip),%ymm14,%ymm14
-	vpaddd	%ymm14,%ymm10,%ymm10
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpslld	$7,%ymm6,%ymm3
-	vpsrld	$25,%ymm6,%ymm6
-	vpxor	%ymm3,%ymm6,%ymm6
-	vpalignr	$4,%ymm14,%ymm14,%ymm14
-	vpalignr	$8,%ymm10,%ymm10,%ymm10
-	vpalignr	$12,%ymm6,%ymm6,%ymm6
-
-	decq	%r10
-	jne	L$seal_avx2_320_rounds
-	vpaddd	L$chacha20_consts(%rip),%ymm0,%ymm0
-	vpaddd	L$chacha20_consts(%rip),%ymm1,%ymm1
-	vpaddd	L$chacha20_consts(%rip),%ymm2,%ymm2
-	vpaddd	%ymm7,%ymm4,%ymm4
-	vpaddd	%ymm7,%ymm5,%ymm5
-	vpaddd	%ymm7,%ymm6,%ymm6
-	vpaddd	%ymm11,%ymm8,%ymm8
-	vpaddd	%ymm11,%ymm9,%ymm9
-	vpaddd	%ymm11,%ymm10,%ymm10
-	vpaddd	0+160(%rbp),%ymm12,%ymm12
-	vpaddd	0+192(%rbp),%ymm13,%ymm13
-	vpaddd	0+224(%rbp),%ymm14,%ymm14
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-
-	vpand	L$clamp(%rip),%ymm3,%ymm3
-	vmovdqa	%ymm3,0+0(%rbp)
-
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
-	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
-	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
-	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
-	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
-	jmp	L$seal_avx2_short
-
-L$seal_avx2_192:
-	vmovdqa	%ymm0,%ymm1
-	vmovdqa	%ymm0,%ymm2
-	vmovdqa	%ymm4,%ymm5
-	vmovdqa	%ymm4,%ymm6
-	vmovdqa	%ymm8,%ymm9
-	vmovdqa	%ymm8,%ymm10
-	vpaddd	L$avx2_inc(%rip),%ymm12,%ymm13
-	vmovdqa	%ymm12,%ymm11
-	vmovdqa	%ymm13,%ymm15
-	movq	$10,%r10
-L$seal_avx2_192_rounds:
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$12,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$4,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$12,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$4,%ymm5,%ymm5,%ymm5
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol16(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$20,%ymm4,%ymm3
-	vpslld	$12,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpaddd	%ymm4,%ymm0,%ymm0
-	vpxor	%ymm0,%ymm12,%ymm12
-	vpshufb	L$rol8(%rip),%ymm12,%ymm12
-	vpaddd	%ymm12,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpslld	$7,%ymm4,%ymm3
-	vpsrld	$25,%ymm4,%ymm4
-	vpxor	%ymm3,%ymm4,%ymm4
-	vpalignr	$4,%ymm12,%ymm12,%ymm12
-	vpalignr	$8,%ymm8,%ymm8,%ymm8
-	vpalignr	$12,%ymm4,%ymm4,%ymm4
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol16(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpsrld	$20,%ymm5,%ymm3
-	vpslld	$12,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpaddd	%ymm5,%ymm1,%ymm1
-	vpxor	%ymm1,%ymm13,%ymm13
-	vpshufb	L$rol8(%rip),%ymm13,%ymm13
-	vpaddd	%ymm13,%ymm9,%ymm9
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpslld	$7,%ymm5,%ymm3
-	vpsrld	$25,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm5,%ymm5
-	vpalignr	$4,%ymm13,%ymm13,%ymm13
-	vpalignr	$8,%ymm9,%ymm9,%ymm9
-	vpalignr	$12,%ymm5,%ymm5,%ymm5
-
-	decq	%r10
-	jne	L$seal_avx2_192_rounds
-	vpaddd	%ymm2,%ymm0,%ymm0
-	vpaddd	%ymm2,%ymm1,%ymm1
-	vpaddd	%ymm6,%ymm4,%ymm4
-	vpaddd	%ymm6,%ymm5,%ymm5
-	vpaddd	%ymm10,%ymm8,%ymm8
-	vpaddd	%ymm10,%ymm9,%ymm9
-	vpaddd	%ymm11,%ymm12,%ymm12
-	vpaddd	%ymm15,%ymm13,%ymm13
-	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
-
-	vpand	L$clamp(%rip),%ymm3,%ymm3
-	vmovdqa	%ymm3,0+0(%rbp)
-
-	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
-	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
-	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
-	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
-	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
-	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
-L$seal_avx2_short:
-	movq	%r8,%r8
-	call	poly_hash_ad_internal
-	xorq	%rcx,%rcx
-L$seal_avx2_short_hash_remainder:
-	cmpq	$16,%rcx
-	jb	L$seal_avx2_short_loop
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	subq	$16,%rcx
-	addq	$16,%rdi
-	jmp	L$seal_avx2_short_hash_remainder
-L$seal_avx2_short_loop:
-	cmpq	$32,%rbx
-	jb	L$seal_avx2_short_tail
-	subq	$32,%rbx
-
-	vpxor	(%rsi),%ymm0,%ymm0
-	vmovdqu	%ymm0,(%rdi)
-	leaq	32(%rsi),%rsi
-
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-	addq	0+16(%rdi),%r10
-	adcq	8+16(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	32(%rdi),%rdi
-
-	vmovdqa	%ymm4,%ymm0
-	vmovdqa	%ymm8,%ymm4
-	vmovdqa	%ymm12,%ymm8
-	vmovdqa	%ymm1,%ymm12
-	vmovdqa	%ymm5,%ymm1
-	vmovdqa	%ymm9,%ymm5
-	vmovdqa	%ymm13,%ymm9
-	vmovdqa	%ymm2,%ymm13
-	vmovdqa	%ymm6,%ymm2
-	jmp	L$seal_avx2_short_loop
-L$seal_avx2_short_tail:
-	cmpq	$16,%rbx
-	jb	L$seal_avx2_exit
-	subq	$16,%rbx
-	vpxor	(%rsi),%xmm0,%xmm3
-	vmovdqu	%xmm3,(%rdi)
-	leaq	16(%rsi),%rsi
-	addq	0+0(%rdi),%r10
-	adcq	8+0(%rdi),%r11
-	adcq	$1,%r12
-	movq	0+0+0(%rbp),%rax
-	movq	%rax,%r15
-	mulq	%r10
-	movq	%rax,%r13
-	movq	%rdx,%r14
-	movq	0+0+0(%rbp),%rax
-	mulq	%r11
-	imulq	%r12,%r15
-	addq	%rax,%r14
-	adcq	%rdx,%r15
-	movq	8+0+0(%rbp),%rax
-	movq	%rax,%r9
-	mulq	%r10
-	addq	%rax,%r14
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-	movq	8+0+0(%rbp),%rax
-	mulq	%r11
-	addq	%rax,%r15
-	adcq	$0,%rdx
-	imulq	%r12,%r9
-	addq	%r10,%r15
-	adcq	%rdx,%r9
-	movq	%r13,%r10
-	movq	%r14,%r11
-	movq	%r15,%r12
-	andq	$3,%r12
-	movq	%r15,%r13
-	andq	$-4,%r13
-	movq	%r9,%r14
-	shrdq	$2,%r9,%r15
-	shrq	$2,%r9
-	addq	%r13,%r15
-	adcq	%r14,%r9
-	addq	%r15,%r10
-	adcq	%r9,%r11
-	adcq	$0,%r12
-
-	leaq	16(%rdi),%rdi
-	vextracti128	$1,%ymm0,%xmm0
-L$seal_avx2_exit:
-	vzeroupper
-	jmp	L$seal_sse_tail_16
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S
deleted file mode 100644
index e1247bc..0000000
--- a/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64-apple.S
+++ /dev/null
@@ -1,868 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-.p2align	5
-_aesni_ctr32_ghash_6x:
-
-	vmovdqu	32(%r11),%xmm2
-	subq	$6,%rdx
-	vpxor	%xmm4,%xmm4,%xmm4
-	vmovdqu	0-128(%rcx),%xmm15
-	vpaddb	%xmm2,%xmm1,%xmm10
-	vpaddb	%xmm2,%xmm10,%xmm11
-	vpaddb	%xmm2,%xmm11,%xmm12
-	vpaddb	%xmm2,%xmm12,%xmm13
-	vpaddb	%xmm2,%xmm13,%xmm14
-	vpxor	%xmm15,%xmm1,%xmm9
-	vmovdqu	%xmm4,16+8(%rsp)
-	jmp	L$oop6x
-
-.p2align	5
-L$oop6x:
-	addl	$100663296,%ebx
-	jc	L$handle_ctr32
-	vmovdqu	0-32(%r9),%xmm3
-	vpaddb	%xmm2,%xmm14,%xmm1
-	vpxor	%xmm15,%xmm10,%xmm10
-	vpxor	%xmm15,%xmm11,%xmm11
-
-L$resume_ctr32:
-	vmovdqu	%xmm1,(%r8)
-	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
-	vpxor	%xmm15,%xmm12,%xmm12
-	vmovups	16-128(%rcx),%xmm2
-	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-	xorq	%r12,%r12
-	cmpq	%r14,%r15
-
-	vaesenc	%xmm2,%xmm9,%xmm9
-	vmovdqu	48+8(%rsp),%xmm0
-	vpxor	%xmm15,%xmm13,%xmm13
-	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
-	vaesenc	%xmm2,%xmm10,%xmm10
-	vpxor	%xmm15,%xmm14,%xmm14
-	setnc	%r12b
-	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
-	vaesenc	%xmm2,%xmm11,%xmm11
-	vmovdqu	16-32(%r9),%xmm3
-	negq	%r12
-	vaesenc	%xmm2,%xmm12,%xmm12
-	vpxor	%xmm5,%xmm6,%xmm6
-	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
-	vpxor	%xmm4,%xmm8,%xmm8
-	vaesenc	%xmm2,%xmm13,%xmm13
-	vpxor	%xmm5,%xmm1,%xmm4
-	andq	$0x60,%r12
-	vmovups	32-128(%rcx),%xmm15
-	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
-	vaesenc	%xmm2,%xmm14,%xmm14
-
-	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
-	leaq	(%r14,%r12,1),%r14
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	16+8(%rsp),%xmm8,%xmm8
-	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
-	vmovdqu	64+8(%rsp),%xmm0
-	vaesenc	%xmm15,%xmm10,%xmm10
-	movbeq	88(%r14),%r13
-	vaesenc	%xmm15,%xmm11,%xmm11
-	movbeq	80(%r14),%r12
-	vaesenc	%xmm15,%xmm12,%xmm12
-	movq	%r13,32+8(%rsp)
-	vaesenc	%xmm15,%xmm13,%xmm13
-	movq	%r12,40+8(%rsp)
-	vmovdqu	48-32(%r9),%xmm5
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vmovups	48-128(%rcx),%xmm15
-	vpxor	%xmm1,%xmm6,%xmm6
-	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	%xmm2,%xmm6,%xmm6
-	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vpxor	%xmm3,%xmm7,%xmm7
-	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
-	vmovdqu	80+8(%rsp),%xmm0
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vpxor	%xmm1,%xmm4,%xmm4
-	vmovdqu	64-32(%r9),%xmm1
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vmovups	64-128(%rcx),%xmm15
-	vpxor	%xmm2,%xmm6,%xmm6
-	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	%xmm3,%xmm6,%xmm6
-	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
-	vaesenc	%xmm15,%xmm10,%xmm10
-	movbeq	72(%r14),%r13
-	vpxor	%xmm5,%xmm7,%xmm7
-	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
-	vaesenc	%xmm15,%xmm11,%xmm11
-	movbeq	64(%r14),%r12
-	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
-	vmovdqu	96+8(%rsp),%xmm0
-	vaesenc	%xmm15,%xmm12,%xmm12
-	movq	%r13,48+8(%rsp)
-	vaesenc	%xmm15,%xmm13,%xmm13
-	movq	%r12,56+8(%rsp)
-	vpxor	%xmm2,%xmm4,%xmm4
-	vmovdqu	96-32(%r9),%xmm2
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vmovups	80-128(%rcx),%xmm15
-	vpxor	%xmm3,%xmm6,%xmm6
-	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	%xmm5,%xmm6,%xmm6
-	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
-	vaesenc	%xmm15,%xmm10,%xmm10
-	movbeq	56(%r14),%r13
-	vpxor	%xmm1,%xmm7,%xmm7
-	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
-	vpxor	112+8(%rsp),%xmm8,%xmm8
-	vaesenc	%xmm15,%xmm11,%xmm11
-	movbeq	48(%r14),%r12
-	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
-	vaesenc	%xmm15,%xmm12,%xmm12
-	movq	%r13,64+8(%rsp)
-	vaesenc	%xmm15,%xmm13,%xmm13
-	movq	%r12,72+8(%rsp)
-	vpxor	%xmm3,%xmm4,%xmm4
-	vmovdqu	112-32(%r9),%xmm3
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vmovups	96-128(%rcx),%xmm15
-	vpxor	%xmm5,%xmm6,%xmm6
-	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	%xmm1,%xmm6,%xmm6
-	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
-	vaesenc	%xmm15,%xmm10,%xmm10
-	movbeq	40(%r14),%r13
-	vpxor	%xmm2,%xmm7,%xmm7
-	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
-	vaesenc	%xmm15,%xmm11,%xmm11
-	movbeq	32(%r14),%r12
-	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
-	vaesenc	%xmm15,%xmm12,%xmm12
-	movq	%r13,80+8(%rsp)
-	vaesenc	%xmm15,%xmm13,%xmm13
-	movq	%r12,88+8(%rsp)
-	vpxor	%xmm5,%xmm6,%xmm6
-	vaesenc	%xmm15,%xmm14,%xmm14
-	vpxor	%xmm1,%xmm6,%xmm6
-
-	vmovups	112-128(%rcx),%xmm15
-	vpslldq	$8,%xmm6,%xmm5
-	vpxor	%xmm2,%xmm4,%xmm4
-	vmovdqu	16(%r11),%xmm3
-
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	%xmm8,%xmm7,%xmm7
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vpxor	%xmm5,%xmm4,%xmm4
-	movbeq	24(%r14),%r13
-	vaesenc	%xmm15,%xmm11,%xmm11
-	movbeq	16(%r14),%r12
-	vpalignr	$8,%xmm4,%xmm4,%xmm0
-	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
-	movq	%r13,96+8(%rsp)
-	vaesenc	%xmm15,%xmm12,%xmm12
-	movq	%r12,104+8(%rsp)
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vmovups	128-128(%rcx),%xmm1
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vmovups	144-128(%rcx),%xmm15
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vpsrldq	$8,%xmm6,%xmm6
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vpxor	%xmm6,%xmm7,%xmm7
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vpxor	%xmm0,%xmm4,%xmm4
-	movbeq	8(%r14),%r13
-	vaesenc	%xmm1,%xmm13,%xmm13
-	movbeq	0(%r14),%r12
-	vaesenc	%xmm1,%xmm14,%xmm14
-	vmovups	160-128(%rcx),%xmm1
-	cmpl	$11,%r10d
-	jb	L$enc_tail
-
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-	vmovups	176-128(%rcx),%xmm15
-	vaesenc	%xmm1,%xmm14,%xmm14
-	vmovups	192-128(%rcx),%xmm1
-	je	L$enc_tail
-
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vaesenc	%xmm15,%xmm14,%xmm14
-
-	vaesenc	%xmm1,%xmm9,%xmm9
-	vaesenc	%xmm1,%xmm10,%xmm10
-	vaesenc	%xmm1,%xmm11,%xmm11
-	vaesenc	%xmm1,%xmm12,%xmm12
-	vaesenc	%xmm1,%xmm13,%xmm13
-	vmovups	208-128(%rcx),%xmm15
-	vaesenc	%xmm1,%xmm14,%xmm14
-	vmovups	224-128(%rcx),%xmm1
-	jmp	L$enc_tail
-
-.p2align	5
-L$handle_ctr32:
-	vmovdqu	(%r11),%xmm0
-	vpshufb	%xmm0,%xmm1,%xmm6
-	vmovdqu	48(%r11),%xmm5
-	vpaddd	64(%r11),%xmm6,%xmm10
-	vpaddd	%xmm5,%xmm6,%xmm11
-	vmovdqu	0-32(%r9),%xmm3
-	vpaddd	%xmm5,%xmm10,%xmm12
-	vpshufb	%xmm0,%xmm10,%xmm10
-	vpaddd	%xmm5,%xmm11,%xmm13
-	vpshufb	%xmm0,%xmm11,%xmm11
-	vpxor	%xmm15,%xmm10,%xmm10
-	vpaddd	%xmm5,%xmm12,%xmm14
-	vpshufb	%xmm0,%xmm12,%xmm12
-	vpxor	%xmm15,%xmm11,%xmm11
-	vpaddd	%xmm5,%xmm13,%xmm1
-	vpshufb	%xmm0,%xmm13,%xmm13
-	vpshufb	%xmm0,%xmm14,%xmm14
-	vpshufb	%xmm0,%xmm1,%xmm1
-	jmp	L$resume_ctr32
-
-.p2align	5
-L$enc_tail:
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vmovdqu	%xmm7,16+8(%rsp)
-	vpalignr	$8,%xmm4,%xmm4,%xmm8
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
-	vpxor	0(%rdi),%xmm1,%xmm2
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vpxor	16(%rdi),%xmm1,%xmm0
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vpxor	32(%rdi),%xmm1,%xmm5
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vpxor	48(%rdi),%xmm1,%xmm6
-	vaesenc	%xmm15,%xmm14,%xmm14
-	vpxor	64(%rdi),%xmm1,%xmm7
-	vpxor	80(%rdi),%xmm1,%xmm3
-	vmovdqu	(%r8),%xmm1
-
-	vaesenclast	%xmm2,%xmm9,%xmm9
-	vmovdqu	32(%r11),%xmm2
-	vaesenclast	%xmm0,%xmm10,%xmm10
-	vpaddb	%xmm2,%xmm1,%xmm0
-	movq	%r13,112+8(%rsp)
-	leaq	96(%rdi),%rdi
-
-	prefetcht0	512(%rdi)
-	prefetcht0	576(%rdi)
-	vaesenclast	%xmm5,%xmm11,%xmm11
-	vpaddb	%xmm2,%xmm0,%xmm5
-	movq	%r12,120+8(%rsp)
-	leaq	96(%rsi),%rsi
-	vmovdqu	0-128(%rcx),%xmm15
-	vaesenclast	%xmm6,%xmm12,%xmm12
-	vpaddb	%xmm2,%xmm5,%xmm6
-	vaesenclast	%xmm7,%xmm13,%xmm13
-	vpaddb	%xmm2,%xmm6,%xmm7
-	vaesenclast	%xmm3,%xmm14,%xmm14
-	vpaddb	%xmm2,%xmm7,%xmm3
-
-	addq	$0x60,%rax
-	subq	$0x6,%rdx
-	jc	L$6x_done
-
-	vmovups	%xmm9,-96(%rsi)
-	vpxor	%xmm15,%xmm1,%xmm9
-	vmovups	%xmm10,-80(%rsi)
-	vmovdqa	%xmm0,%xmm10
-	vmovups	%xmm11,-64(%rsi)
-	vmovdqa	%xmm5,%xmm11
-	vmovups	%xmm12,-48(%rsi)
-	vmovdqa	%xmm6,%xmm12
-	vmovups	%xmm13,-32(%rsi)
-	vmovdqa	%xmm7,%xmm13
-	vmovups	%xmm14,-16(%rsi)
-	vmovdqa	%xmm3,%xmm14
-	vmovdqu	32+8(%rsp),%xmm7
-	jmp	L$oop6x
-
-L$6x_done:
-	vpxor	16+8(%rsp),%xmm8,%xmm8
-	vpxor	%xmm4,%xmm8,%xmm8
-
-	ret
-
-
-.globl	_aesni_gcm_decrypt
-.private_extern _aesni_gcm_decrypt
-
-.p2align	5
-_aesni_gcm_decrypt:
-
-
-_CET_ENDBR
-	xorq	%rax,%rax
-
-
-
-	cmpq	$0x60,%rdx
-	jb	L$gcm_dec_abort
-
-	pushq	%rbp
-
-
-	movq	%rsp,%rbp
-
-	pushq	%rbx
-
-
-	pushq	%r12
-
-
-	pushq	%r13
-
-
-	pushq	%r14
-
-
-	pushq	%r15
-
-
-	vzeroupper
-
-	movq	16(%rbp),%r12
-	vmovdqu	(%r8),%xmm1
-	addq	$-128,%rsp
-	movl	12(%r8),%ebx
-	leaq	L$bswap_mask(%rip),%r11
-	leaq	-128(%rcx),%r14
-	movq	$0xf80,%r15
-	vmovdqu	(%r12),%xmm8
-	andq	$-128,%rsp
-	vmovdqu	(%r11),%xmm0
-	leaq	128(%rcx),%rcx
-	leaq	32(%r9),%r9
-	movl	240-128(%rcx),%r10d
-	vpshufb	%xmm0,%xmm8,%xmm8
-
-	andq	%r15,%r14
-	andq	%rsp,%r15
-	subq	%r14,%r15
-	jc	L$dec_no_key_aliasing
-	cmpq	$768,%r15
-	jnc	L$dec_no_key_aliasing
-	subq	%r15,%rsp
-L$dec_no_key_aliasing:
-
-	vmovdqu	80(%rdi),%xmm7
-	movq	%rdi,%r14
-	vmovdqu	64(%rdi),%xmm4
-
-
-
-
-
-
-
-	leaq	-192(%rdi,%rdx,1),%r15
-
-	vmovdqu	48(%rdi),%xmm5
-	shrq	$4,%rdx
-	xorq	%rax,%rax
-	vmovdqu	32(%rdi),%xmm6
-	vpshufb	%xmm0,%xmm7,%xmm7
-	vmovdqu	16(%rdi),%xmm2
-	vpshufb	%xmm0,%xmm4,%xmm4
-	vmovdqu	(%rdi),%xmm3
-	vpshufb	%xmm0,%xmm5,%xmm5
-	vmovdqu	%xmm4,48(%rsp)
-	vpshufb	%xmm0,%xmm6,%xmm6
-	vmovdqu	%xmm5,64(%rsp)
-	vpshufb	%xmm0,%xmm2,%xmm2
-	vmovdqu	%xmm6,80(%rsp)
-	vpshufb	%xmm0,%xmm3,%xmm3
-	vmovdqu	%xmm2,96(%rsp)
-	vmovdqu	%xmm3,112(%rsp)
-
-	call	_aesni_ctr32_ghash_6x
-
-	movq	16(%rbp),%r12
-	vmovups	%xmm9,-96(%rsi)
-	vmovups	%xmm10,-80(%rsi)
-	vmovups	%xmm11,-64(%rsi)
-	vmovups	%xmm12,-48(%rsi)
-	vmovups	%xmm13,-32(%rsi)
-	vmovups	%xmm14,-16(%rsi)
-
-	vpshufb	(%r11),%xmm8,%xmm8
-	vmovdqu	%xmm8,(%r12)
-
-	vzeroupper
-	leaq	-40(%rbp),%rsp
-
-	popq	%r15
-
-	popq	%r14
-
-	popq	%r13
-
-	popq	%r12
-
-	popq	%rbx
-
-	popq	%rbp
-
-L$gcm_dec_abort:
-	ret
-
-
-
-
-.p2align	5
-_aesni_ctr32_6x:
-
-	vmovdqu	0-128(%rcx),%xmm4
-	vmovdqu	32(%r11),%xmm2
-	leaq	-1(%r10),%r13
-	vmovups	16-128(%rcx),%xmm15
-	leaq	32-128(%rcx),%r12
-	vpxor	%xmm4,%xmm1,%xmm9
-	addl	$100663296,%ebx
-	jc	L$handle_ctr32_2
-	vpaddb	%xmm2,%xmm1,%xmm10
-	vpaddb	%xmm2,%xmm10,%xmm11
-	vpxor	%xmm4,%xmm10,%xmm10
-	vpaddb	%xmm2,%xmm11,%xmm12
-	vpxor	%xmm4,%xmm11,%xmm11
-	vpaddb	%xmm2,%xmm12,%xmm13
-	vpxor	%xmm4,%xmm12,%xmm12
-	vpaddb	%xmm2,%xmm13,%xmm14
-	vpxor	%xmm4,%xmm13,%xmm13
-	vpaddb	%xmm2,%xmm14,%xmm1
-	vpxor	%xmm4,%xmm14,%xmm14
-	jmp	L$oop_ctr32
-
-.p2align	4
-L$oop_ctr32:
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vaesenc	%xmm15,%xmm14,%xmm14
-	vmovups	(%r12),%xmm15
-	leaq	16(%r12),%r12
-	decl	%r13d
-	jnz	L$oop_ctr32
-
-	vmovdqu	(%r12),%xmm3
-	vaesenc	%xmm15,%xmm9,%xmm9
-	vpxor	0(%rdi),%xmm3,%xmm4
-	vaesenc	%xmm15,%xmm10,%xmm10
-	vpxor	16(%rdi),%xmm3,%xmm5
-	vaesenc	%xmm15,%xmm11,%xmm11
-	vpxor	32(%rdi),%xmm3,%xmm6
-	vaesenc	%xmm15,%xmm12,%xmm12
-	vpxor	48(%rdi),%xmm3,%xmm8
-	vaesenc	%xmm15,%xmm13,%xmm13
-	vpxor	64(%rdi),%xmm3,%xmm2
-	vaesenc	%xmm15,%xmm14,%xmm14
-	vpxor	80(%rdi),%xmm3,%xmm3
-	leaq	96(%rdi),%rdi
-
-	vaesenclast	%xmm4,%xmm9,%xmm9
-	vaesenclast	%xmm5,%xmm10,%xmm10
-	vaesenclast	%xmm6,%xmm11,%xmm11
-	vaesenclast	%xmm8,%xmm12,%xmm12
-	vaesenclast	%xmm2,%xmm13,%xmm13
-	vaesenclast	%xmm3,%xmm14,%xmm14
-	vmovups	%xmm9,0(%rsi)
-	vmovups	%xmm10,16(%rsi)
-	vmovups	%xmm11,32(%rsi)
-	vmovups	%xmm12,48(%rsi)
-	vmovups	%xmm13,64(%rsi)
-	vmovups	%xmm14,80(%rsi)
-	leaq	96(%rsi),%rsi
-
-	ret
-.p2align	5
-L$handle_ctr32_2:
-	vpshufb	%xmm0,%xmm1,%xmm6
-	vmovdqu	48(%r11),%xmm5
-	vpaddd	64(%r11),%xmm6,%xmm10
-	vpaddd	%xmm5,%xmm6,%xmm11
-	vpaddd	%xmm5,%xmm10,%xmm12
-	vpshufb	%xmm0,%xmm10,%xmm10
-	vpaddd	%xmm5,%xmm11,%xmm13
-	vpshufb	%xmm0,%xmm11,%xmm11
-	vpxor	%xmm4,%xmm10,%xmm10
-	vpaddd	%xmm5,%xmm12,%xmm14
-	vpshufb	%xmm0,%xmm12,%xmm12
-	vpxor	%xmm4,%xmm11,%xmm11
-	vpaddd	%xmm5,%xmm13,%xmm1
-	vpshufb	%xmm0,%xmm13,%xmm13
-	vpxor	%xmm4,%xmm12,%xmm12
-	vpshufb	%xmm0,%xmm14,%xmm14
-	vpxor	%xmm4,%xmm13,%xmm13
-	vpshufb	%xmm0,%xmm1,%xmm1
-	vpxor	%xmm4,%xmm14,%xmm14
-	jmp	L$oop_ctr32
-
-
-
-.globl	_aesni_gcm_encrypt
-.private_extern _aesni_gcm_encrypt
-
-.p2align	5
-_aesni_gcm_encrypt:
-
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
-	movb	$1,_BORINGSSL_function_hit+2(%rip)
-#endif
-	xorq	%rax,%rax
-
-
-
-
-	cmpq	$288,%rdx
-	jb	L$gcm_enc_abort
-
-	pushq	%rbp
-
-
-	movq	%rsp,%rbp
-
-	pushq	%rbx
-
-
-	pushq	%r12
-
-
-	pushq	%r13
-
-
-	pushq	%r14
-
-
-	pushq	%r15
-
-
-	vzeroupper
-
-	vmovdqu	(%r8),%xmm1
-	addq	$-128,%rsp
-	movl	12(%r8),%ebx
-	leaq	L$bswap_mask(%rip),%r11
-	leaq	-128(%rcx),%r14
-	movq	$0xf80,%r15
-	leaq	128(%rcx),%rcx
-	vmovdqu	(%r11),%xmm0
-	andq	$-128,%rsp
-	movl	240-128(%rcx),%r10d
-
-	andq	%r15,%r14
-	andq	%rsp,%r15
-	subq	%r14,%r15
-	jc	L$enc_no_key_aliasing
-	cmpq	$768,%r15
-	jnc	L$enc_no_key_aliasing
-	subq	%r15,%rsp
-L$enc_no_key_aliasing:
-
-	movq	%rsi,%r14
-
-
-
-
-
-
-
-
-	leaq	-192(%rsi,%rdx,1),%r15
-
-	shrq	$4,%rdx
-
-	call	_aesni_ctr32_6x
-	vpshufb	%xmm0,%xmm9,%xmm8
-	vpshufb	%xmm0,%xmm10,%xmm2
-	vmovdqu	%xmm8,112(%rsp)
-	vpshufb	%xmm0,%xmm11,%xmm4
-	vmovdqu	%xmm2,96(%rsp)
-	vpshufb	%xmm0,%xmm12,%xmm5
-	vmovdqu	%xmm4,80(%rsp)
-	vpshufb	%xmm0,%xmm13,%xmm6
-	vmovdqu	%xmm5,64(%rsp)
-	vpshufb	%xmm0,%xmm14,%xmm7
-	vmovdqu	%xmm6,48(%rsp)
-
-	call	_aesni_ctr32_6x
-
-	movq	16(%rbp),%r12
-	leaq	32(%r9),%r9
-	vmovdqu	(%r12),%xmm8
-	subq	$12,%rdx
-	movq	$192,%rax
-	vpshufb	%xmm0,%xmm8,%xmm8
-
-	call	_aesni_ctr32_ghash_6x
-	vmovdqu	32(%rsp),%xmm7
-	vmovdqu	(%r11),%xmm0
-	vmovdqu	0-32(%r9),%xmm3
-	vpunpckhqdq	%xmm7,%xmm7,%xmm1
-	vmovdqu	32-32(%r9),%xmm15
-	vmovups	%xmm9,-96(%rsi)
-	vpshufb	%xmm0,%xmm9,%xmm9
-	vpxor	%xmm7,%xmm1,%xmm1
-	vmovups	%xmm10,-80(%rsi)
-	vpshufb	%xmm0,%xmm10,%xmm10
-	vmovups	%xmm11,-64(%rsi)
-	vpshufb	%xmm0,%xmm11,%xmm11
-	vmovups	%xmm12,-48(%rsi)
-	vpshufb	%xmm0,%xmm12,%xmm12
-	vmovups	%xmm13,-32(%rsi)
-	vpshufb	%xmm0,%xmm13,%xmm13
-	vmovups	%xmm14,-16(%rsi)
-	vpshufb	%xmm0,%xmm14,%xmm14
-	vmovdqu	%xmm9,16(%rsp)
-	vmovdqu	48(%rsp),%xmm6
-	vmovdqu	16-32(%r9),%xmm0
-	vpunpckhqdq	%xmm6,%xmm6,%xmm2
-	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
-	vpxor	%xmm6,%xmm2,%xmm2
-	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
-	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
-
-	vmovdqu	64(%rsp),%xmm9
-	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
-	vmovdqu	48-32(%r9),%xmm3
-	vpxor	%xmm5,%xmm4,%xmm4
-	vpunpckhqdq	%xmm9,%xmm9,%xmm5
-	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
-	vpxor	%xmm9,%xmm5,%xmm5
-	vpxor	%xmm7,%xmm6,%xmm6
-	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
-	vmovdqu	80-32(%r9),%xmm15
-	vpxor	%xmm1,%xmm2,%xmm2
-
-	vmovdqu	80(%rsp),%xmm1
-	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
-	vmovdqu	64-32(%r9),%xmm0
-	vpxor	%xmm4,%xmm7,%xmm7
-	vpunpckhqdq	%xmm1,%xmm1,%xmm4
-	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpxor	%xmm6,%xmm9,%xmm9
-	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
-	vpxor	%xmm2,%xmm5,%xmm5
-
-	vmovdqu	96(%rsp),%xmm2
-	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
-	vmovdqu	96-32(%r9),%xmm3
-	vpxor	%xmm7,%xmm6,%xmm6
-	vpunpckhqdq	%xmm2,%xmm2,%xmm7
-	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm7,%xmm7
-	vpxor	%xmm9,%xmm1,%xmm1
-	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
-	vmovdqu	128-32(%r9),%xmm15
-	vpxor	%xmm5,%xmm4,%xmm4
-
-	vpxor	112(%rsp),%xmm8,%xmm8
-	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
-	vmovdqu	112-32(%r9),%xmm0
-	vpunpckhqdq	%xmm8,%xmm8,%xmm9
-	vpxor	%xmm6,%xmm5,%xmm5
-	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
-	vpxor	%xmm8,%xmm9,%xmm9
-	vpxor	%xmm1,%xmm2,%xmm2
-	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
-	vpxor	%xmm4,%xmm7,%xmm4
-
-	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
-	vmovdqu	0-32(%r9),%xmm3
-	vpunpckhqdq	%xmm14,%xmm14,%xmm1
-	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
-	vpxor	%xmm14,%xmm1,%xmm1
-	vpxor	%xmm5,%xmm6,%xmm5
-	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
-	vmovdqu	32-32(%r9),%xmm15
-	vpxor	%xmm2,%xmm8,%xmm7
-	vpxor	%xmm4,%xmm9,%xmm6
-
-	vmovdqu	16-32(%r9),%xmm0
-	vpxor	%xmm5,%xmm7,%xmm9
-	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
-	vpxor	%xmm9,%xmm6,%xmm6
-	vpunpckhqdq	%xmm13,%xmm13,%xmm2
-	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
-	vpxor	%xmm13,%xmm2,%xmm2
-	vpslldq	$8,%xmm6,%xmm9
-	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
-	vpxor	%xmm9,%xmm5,%xmm8
-	vpsrldq	$8,%xmm6,%xmm6
-	vpxor	%xmm6,%xmm7,%xmm7
-
-	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
-	vmovdqu	48-32(%r9),%xmm3
-	vpxor	%xmm4,%xmm5,%xmm5
-	vpunpckhqdq	%xmm12,%xmm12,%xmm9
-	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
-	vpxor	%xmm12,%xmm9,%xmm9
-	vpxor	%xmm14,%xmm13,%xmm13
-	vpalignr	$8,%xmm8,%xmm8,%xmm14
-	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
-	vmovdqu	80-32(%r9),%xmm15
-	vpxor	%xmm1,%xmm2,%xmm2
-
-	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
-	vmovdqu	64-32(%r9),%xmm0
-	vpxor	%xmm5,%xmm4,%xmm4
-	vpunpckhqdq	%xmm11,%xmm11,%xmm1
-	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
-	vpxor	%xmm11,%xmm1,%xmm1
-	vpxor	%xmm13,%xmm12,%xmm12
-	vxorps	16(%rsp),%xmm7,%xmm7
-	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
-	vpxor	%xmm2,%xmm9,%xmm9
-
-	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
-	vxorps	%xmm14,%xmm8,%xmm8
-
-	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
-	vmovdqu	96-32(%r9),%xmm3
-	vpxor	%xmm4,%xmm5,%xmm5
-	vpunpckhqdq	%xmm10,%xmm10,%xmm2
-	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
-	vpxor	%xmm10,%xmm2,%xmm2
-	vpalignr	$8,%xmm8,%xmm8,%xmm14
-	vpxor	%xmm12,%xmm11,%xmm11
-	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
-	vmovdqu	128-32(%r9),%xmm15
-	vpxor	%xmm9,%xmm1,%xmm1
-
-	vxorps	%xmm7,%xmm14,%xmm14
-	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
-	vxorps	%xmm14,%xmm8,%xmm8
-
-	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
-	vmovdqu	112-32(%r9),%xmm0
-	vpxor	%xmm5,%xmm4,%xmm4
-	vpunpckhqdq	%xmm8,%xmm8,%xmm9
-	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
-	vpxor	%xmm8,%xmm9,%xmm9
-	vpxor	%xmm11,%xmm10,%xmm10
-	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
-	vpxor	%xmm1,%xmm2,%xmm2
-
-	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
-	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
-	vpxor	%xmm4,%xmm5,%xmm5
-	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
-	vpxor	%xmm10,%xmm7,%xmm7
-	vpxor	%xmm2,%xmm6,%xmm6
-
-	vpxor	%xmm5,%xmm7,%xmm4
-	vpxor	%xmm4,%xmm6,%xmm6
-	vpslldq	$8,%xmm6,%xmm1
-	vmovdqu	16(%r11),%xmm3
-	vpsrldq	$8,%xmm6,%xmm6
-	vpxor	%xmm1,%xmm5,%xmm8
-	vpxor	%xmm6,%xmm7,%xmm7
-
-	vpalignr	$8,%xmm8,%xmm8,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
-	vpxor	%xmm2,%xmm8,%xmm8
-
-	vpalignr	$8,%xmm8,%xmm8,%xmm2
-	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
-	vpxor	%xmm7,%xmm2,%xmm2
-	vpxor	%xmm2,%xmm8,%xmm8
-	movq	16(%rbp),%r12
-	vpshufb	(%r11),%xmm8,%xmm8
-	vmovdqu	%xmm8,(%r12)
-
-	vzeroupper
-	leaq	-40(%rbp),%rsp
-
-	popq	%r15
-
-	popq	%r14
-
-	popq	%r13
-
-	popq	%r12
-
-	popq	%rbx
-
-	popq	%rbp
-
-L$gcm_enc_abort:
-	ret
-
-
-
-.section	__DATA,__const
-.p2align	6
-L$bswap_mask:
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$poly:
-.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-L$one_msb:
-.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-L$two_lsb:
-.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-L$one_lsb:
-.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align	6
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S
deleted file mode 100644
index b8ba910..0000000
--- a/apple-x86_64/crypto/fipsmodule/aesni-x86_64-apple.S
+++ /dev/null
@@ -1,2507 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-.globl	_aes_hw_encrypt
-.private_extern _aes_hw_encrypt
-
-.p2align	4
-_aes_hw_encrypt:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
-	movb	$1,_BORINGSSL_function_hit+1(%rip)
-#endif
-	movups	(%rdi),%xmm2
-	movl	240(%rdx),%eax
-	movups	(%rdx),%xmm0
-	movups	16(%rdx),%xmm1
-	leaq	32(%rdx),%rdx
-	xorps	%xmm0,%xmm2
-L$oop_enc1_1:
-.byte	102,15,56,220,209
-	decl	%eax
-	movups	(%rdx),%xmm1
-	leaq	16(%rdx),%rdx
-	jnz	L$oop_enc1_1
-.byte	102,15,56,221,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	ret
-
-
-
-.globl	_aes_hw_decrypt
-.private_extern _aes_hw_decrypt
-
-.p2align	4
-_aes_hw_decrypt:
-
-_CET_ENDBR
-	movups	(%rdi),%xmm2
-	movl	240(%rdx),%eax
-	movups	(%rdx),%xmm0
-	movups	16(%rdx),%xmm1
-	leaq	32(%rdx),%rdx
-	xorps	%xmm0,%xmm2
-L$oop_dec1_2:
-.byte	102,15,56,222,209
-	decl	%eax
-	movups	(%rdx),%xmm1
-	leaq	16(%rdx),%rdx
-	jnz	L$oop_dec1_2
-.byte	102,15,56,223,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	ret
-
-
-
-.p2align	4
-_aesni_encrypt2:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-	addq	$16,%rax
-
-L$enc_loop2:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$enc_loop2
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-	ret
-
-
-
-.p2align	4
-_aesni_decrypt2:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-	addq	$16,%rax
-
-L$dec_loop2:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$dec_loop2
-
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-	ret
-
-
-
-.p2align	4
-_aesni_encrypt3:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	xorps	%xmm0,%xmm4
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-	addq	$16,%rax
-
-L$enc_loop3:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$enc_loop3
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-	ret
-
-
-
-.p2align	4
-_aesni_decrypt3:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	xorps	%xmm0,%xmm4
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-	addq	$16,%rax
-
-L$dec_loop3:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$dec_loop3
-
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-	ret
-
-
-
-.p2align	4
-_aesni_encrypt4:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	xorps	%xmm0,%xmm4
-	xorps	%xmm0,%xmm5
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	0x0f,0x1f,0x00
-	addq	$16,%rax
-
-L$enc_loop4:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$enc_loop4
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-	ret
-
-
-
-.p2align	4
-_aesni_decrypt4:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	xorps	%xmm0,%xmm4
-	xorps	%xmm0,%xmm5
-	movups	32(%rcx),%xmm0
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	0x0f,0x1f,0x00
-	addq	$16,%rax
-
-L$dec_loop4:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$dec_loop4
-
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-	ret
-
-
-
-.p2align	4
-_aesni_encrypt6:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-.byte	102,15,56,220,209
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	102,15,56,220,217
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,220,225
-	pxor	%xmm0,%xmm7
-	movups	(%rcx,%rax,1),%xmm0
-	addq	$16,%rax
-	jmp	L$enc_loop6_enter
-.p2align	4
-L$enc_loop6:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-L$enc_loop6_enter:
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$enc_loop6
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
-	ret
-
-
-
-.p2align	4
-_aesni_decrypt6:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	pxor	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-.byte	102,15,56,222,209
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	102,15,56,222,217
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-.byte	102,15,56,222,225
-	pxor	%xmm0,%xmm7
-	movups	(%rcx,%rax,1),%xmm0
-	addq	$16,%rax
-	jmp	L$dec_loop6_enter
-.p2align	4
-L$dec_loop6:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-L$dec_loop6_enter:
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$dec_loop6
-
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
-	ret
-
-
-
-.p2align	4
-_aesni_encrypt8:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	102,15,56,220,209
-	pxor	%xmm0,%xmm7
-	pxor	%xmm0,%xmm8
-.byte	102,15,56,220,217
-	pxor	%xmm0,%xmm9
-	movups	(%rcx,%rax,1),%xmm0
-	addq	$16,%rax
-	jmp	L$enc_loop8_inner
-.p2align	4
-L$enc_loop8:
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-L$enc_loop8_inner:
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-L$enc_loop8_enter:
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$enc_loop8
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-.byte	102,15,56,221,208
-.byte	102,15,56,221,216
-.byte	102,15,56,221,224
-.byte	102,15,56,221,232
-.byte	102,15,56,221,240
-.byte	102,15,56,221,248
-.byte	102,68,15,56,221,192
-.byte	102,68,15,56,221,200
-	ret
-
-
-
-.p2align	4
-_aesni_decrypt8:
-
-	movups	(%rcx),%xmm0
-	shll	$4,%eax
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm2
-	xorps	%xmm0,%xmm3
-	pxor	%xmm0,%xmm4
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-	leaq	32(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	102,15,56,222,209
-	pxor	%xmm0,%xmm7
-	pxor	%xmm0,%xmm8
-.byte	102,15,56,222,217
-	pxor	%xmm0,%xmm9
-	movups	(%rcx,%rax,1),%xmm0
-	addq	$16,%rax
-	jmp	L$dec_loop8_inner
-.p2align	4
-L$dec_loop8:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-L$dec_loop8_inner:
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-L$dec_loop8_enter:
-	movups	(%rcx,%rax,1),%xmm1
-	addq	$32,%rax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	-16(%rcx,%rax,1),%xmm0
-	jnz	L$dec_loop8
-
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-.byte	102,15,56,223,208
-.byte	102,15,56,223,216
-.byte	102,15,56,223,224
-.byte	102,15,56,223,232
-.byte	102,15,56,223,240
-.byte	102,15,56,223,248
-.byte	102,68,15,56,223,192
-.byte	102,68,15,56,223,200
-	ret
-
-
-.globl	_aes_hw_ecb_encrypt
-.private_extern _aes_hw_ecb_encrypt
-
-.p2align	4
-_aes_hw_ecb_encrypt:
-
-_CET_ENDBR
-	andq	$-16,%rdx
-	jz	L$ecb_ret
-
-	movl	240(%rcx),%eax
-	movups	(%rcx),%xmm0
-	movq	%rcx,%r11
-	movl	%eax,%r10d
-	testl	%r8d,%r8d
-	jz	L$ecb_decrypt
-
-	cmpq	$0x80,%rdx
-	jb	L$ecb_enc_tail
-
-	movdqu	(%rdi),%xmm2
-	movdqu	16(%rdi),%xmm3
-	movdqu	32(%rdi),%xmm4
-	movdqu	48(%rdi),%xmm5
-	movdqu	64(%rdi),%xmm6
-	movdqu	80(%rdi),%xmm7
-	movdqu	96(%rdi),%xmm8
-	movdqu	112(%rdi),%xmm9
-	leaq	128(%rdi),%rdi
-	subq	$0x80,%rdx
-	jmp	L$ecb_enc_loop8_enter
-.p2align	4
-L$ecb_enc_loop8:
-	movups	%xmm2,(%rsi)
-	movq	%r11,%rcx
-	movdqu	(%rdi),%xmm2
-	movl	%r10d,%eax
-	movups	%xmm3,16(%rsi)
-	movdqu	16(%rdi),%xmm3
-	movups	%xmm4,32(%rsi)
-	movdqu	32(%rdi),%xmm4
-	movups	%xmm5,48(%rsi)
-	movdqu	48(%rdi),%xmm5
-	movups	%xmm6,64(%rsi)
-	movdqu	64(%rdi),%xmm6
-	movups	%xmm7,80(%rsi)
-	movdqu	80(%rdi),%xmm7
-	movups	%xmm8,96(%rsi)
-	movdqu	96(%rdi),%xmm8
-	movups	%xmm9,112(%rsi)
-	leaq	128(%rsi),%rsi
-	movdqu	112(%rdi),%xmm9
-	leaq	128(%rdi),%rdi
-L$ecb_enc_loop8_enter:
-
-	call	_aesni_encrypt8
-
-	subq	$0x80,%rdx
-	jnc	L$ecb_enc_loop8
-
-	movups	%xmm2,(%rsi)
-	movq	%r11,%rcx
-	movups	%xmm3,16(%rsi)
-	movl	%r10d,%eax
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
-	movups	%xmm8,96(%rsi)
-	movups	%xmm9,112(%rsi)
-	leaq	128(%rsi),%rsi
-	addq	$0x80,%rdx
-	jz	L$ecb_ret
-
-L$ecb_enc_tail:
-	movups	(%rdi),%xmm2
-	cmpq	$0x20,%rdx
-	jb	L$ecb_enc_one
-	movups	16(%rdi),%xmm3
-	je	L$ecb_enc_two
-	movups	32(%rdi),%xmm4
-	cmpq	$0x40,%rdx
-	jb	L$ecb_enc_three
-	movups	48(%rdi),%xmm5
-	je	L$ecb_enc_four
-	movups	64(%rdi),%xmm6
-	cmpq	$0x60,%rdx
-	jb	L$ecb_enc_five
-	movups	80(%rdi),%xmm7
-	je	L$ecb_enc_six
-	movdqu	96(%rdi),%xmm8
-	xorps	%xmm9,%xmm9
-	call	_aesni_encrypt8
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
-	movups	%xmm8,96(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_one:
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-L$oop_enc1_3:
-.byte	102,15,56,220,209
-	decl	%eax
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_enc1_3
-.byte	102,15,56,221,209
-	movups	%xmm2,(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_two:
-	call	_aesni_encrypt2
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_three:
-	call	_aesni_encrypt3
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_four:
-	call	_aesni_encrypt4
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_five:
-	xorps	%xmm7,%xmm7
-	call	_aesni_encrypt6
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_enc_six:
-	call	_aesni_encrypt6
-	movups	%xmm2,(%rsi)
-	movups	%xmm3,16(%rsi)
-	movups	%xmm4,32(%rsi)
-	movups	%xmm5,48(%rsi)
-	movups	%xmm6,64(%rsi)
-	movups	%xmm7,80(%rsi)
-	jmp	L$ecb_ret
-
-.p2align	4
-L$ecb_decrypt:
-	cmpq	$0x80,%rdx
-	jb	L$ecb_dec_tail
-
-	movdqu	(%rdi),%xmm2
-	movdqu	16(%rdi),%xmm3
-	movdqu	32(%rdi),%xmm4
-	movdqu	48(%rdi),%xmm5
-	movdqu	64(%rdi),%xmm6
-	movdqu	80(%rdi),%xmm7
-	movdqu	96(%rdi),%xmm8
-	movdqu	112(%rdi),%xmm9
-	leaq	128(%rdi),%rdi
-	subq	$0x80,%rdx
-	jmp	L$ecb_dec_loop8_enter
-.p2align	4
-L$ecb_dec_loop8:
-	movups	%xmm2,(%rsi)
-	movq	%r11,%rcx
-	movdqu	(%rdi),%xmm2
-	movl	%r10d,%eax
-	movups	%xmm3,16(%rsi)
-	movdqu	16(%rdi),%xmm3
-	movups	%xmm4,32(%rsi)
-	movdqu	32(%rdi),%xmm4
-	movups	%xmm5,48(%rsi)
-	movdqu	48(%rdi),%xmm5
-	movups	%xmm6,64(%rsi)
-	movdqu	64(%rdi),%xmm6
-	movups	%xmm7,80(%rsi)
-	movdqu	80(%rdi),%xmm7
-	movups	%xmm8,96(%rsi)
-	movdqu	96(%rdi),%xmm8
-	movups	%xmm9,112(%rsi)
-	leaq	128(%rsi),%rsi
-	movdqu	112(%rdi),%xmm9
-	leaq	128(%rdi),%rdi
-L$ecb_dec_loop8_enter:
-
-	call	_aesni_decrypt8
-
-	movups	(%r11),%xmm0
-	subq	$0x80,%rdx
-	jnc	L$ecb_dec_loop8
-
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movq	%r11,%rcx
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movl	%r10d,%eax
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	movups	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	movups	%xmm7,80(%rsi)
-	pxor	%xmm7,%xmm7
-	movups	%xmm8,96(%rsi)
-	pxor	%xmm8,%xmm8
-	movups	%xmm9,112(%rsi)
-	pxor	%xmm9,%xmm9
-	leaq	128(%rsi),%rsi
-	addq	$0x80,%rdx
-	jz	L$ecb_ret
-
-L$ecb_dec_tail:
-	movups	(%rdi),%xmm2
-	cmpq	$0x20,%rdx
-	jb	L$ecb_dec_one
-	movups	16(%rdi),%xmm3
-	je	L$ecb_dec_two
-	movups	32(%rdi),%xmm4
-	cmpq	$0x40,%rdx
-	jb	L$ecb_dec_three
-	movups	48(%rdi),%xmm5
-	je	L$ecb_dec_four
-	movups	64(%rdi),%xmm6
-	cmpq	$0x60,%rdx
-	jb	L$ecb_dec_five
-	movups	80(%rdi),%xmm7
-	je	L$ecb_dec_six
-	movups	96(%rdi),%xmm8
-	movups	(%rcx),%xmm0
-	xorps	%xmm9,%xmm9
-	call	_aesni_decrypt8
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	movups	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	movups	%xmm7,80(%rsi)
-	pxor	%xmm7,%xmm7
-	movups	%xmm8,96(%rsi)
-	pxor	%xmm8,%xmm8
-	pxor	%xmm9,%xmm9
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_one:
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-L$oop_dec1_4:
-.byte	102,15,56,222,209
-	decl	%eax
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_dec1_4
-.byte	102,15,56,223,209
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_two:
-	call	_aesni_decrypt2
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_three:
-	call	_aesni_decrypt3
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_four:
-	call	_aesni_decrypt4
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_five:
-	xorps	%xmm7,%xmm7
-	call	_aesni_decrypt6
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	movups	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	jmp	L$ecb_ret
-.p2align	4
-L$ecb_dec_six:
-	call	_aesni_decrypt6
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movups	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movups	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	movups	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	movups	%xmm7,80(%rsi)
-	pxor	%xmm7,%xmm7
-
-L$ecb_ret:
-	xorps	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	ret
-
-
-.globl	_aes_hw_ctr32_encrypt_blocks
-.private_extern _aes_hw_ctr32_encrypt_blocks
-
-.p2align	4
-_aes_hw_ctr32_encrypt_blocks:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-	movb	$1,_BORINGSSL_function_hit(%rip)
-#endif
-	cmpq	$1,%rdx
-	jne	L$ctr32_bulk
-
-
-
-	movups	(%r8),%xmm2
-	movups	(%rdi),%xmm3
-	movl	240(%rcx),%edx
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-L$oop_enc1_5:
-.byte	102,15,56,220,209
-	decl	%edx
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_enc1_5
-.byte	102,15,56,221,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	xorps	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movups	%xmm2,(%rsi)
-	xorps	%xmm2,%xmm2
-	jmp	L$ctr32_epilogue
-
-.p2align	4
-L$ctr32_bulk:
-	leaq	(%rsp),%r11
-
-	pushq	%rbp
-
-	subq	$128,%rsp
-	andq	$-16,%rsp
-
-
-
-
-	movdqu	(%r8),%xmm2
-	movdqu	(%rcx),%xmm0
-	movl	12(%r8),%r8d
-	pxor	%xmm0,%xmm2
-	movl	12(%rcx),%ebp
-	movdqa	%xmm2,0(%rsp)
-	bswapl	%r8d
-	movdqa	%xmm2,%xmm3
-	movdqa	%xmm2,%xmm4
-	movdqa	%xmm2,%xmm5
-	movdqa	%xmm2,64(%rsp)
-	movdqa	%xmm2,80(%rsp)
-	movdqa	%xmm2,96(%rsp)
-	movq	%rdx,%r10
-	movdqa	%xmm2,112(%rsp)
-
-	leaq	1(%r8),%rax
-	leaq	2(%r8),%rdx
-	bswapl	%eax
-	bswapl	%edx
-	xorl	%ebp,%eax
-	xorl	%ebp,%edx
-.byte	102,15,58,34,216,3
-	leaq	3(%r8),%rax
-	movdqa	%xmm3,16(%rsp)
-.byte	102,15,58,34,226,3
-	bswapl	%eax
-	movq	%r10,%rdx
-	leaq	4(%r8),%r10
-	movdqa	%xmm4,32(%rsp)
-	xorl	%ebp,%eax
-	bswapl	%r10d
-.byte	102,15,58,34,232,3
-	xorl	%ebp,%r10d
-	movdqa	%xmm5,48(%rsp)
-	leaq	5(%r8),%r9
-	movl	%r10d,64+12(%rsp)
-	bswapl	%r9d
-	leaq	6(%r8),%r10
-	movl	240(%rcx),%eax
-	xorl	%ebp,%r9d
-	bswapl	%r10d
-	movl	%r9d,80+12(%rsp)
-	xorl	%ebp,%r10d
-	leaq	7(%r8),%r9
-	movl	%r10d,96+12(%rsp)
-	bswapl	%r9d
-	leaq	_OPENSSL_ia32cap_P(%rip),%r10
-	movl	4(%r10),%r10d
-	xorl	%ebp,%r9d
-	andl	$71303168,%r10d
-	movl	%r9d,112+12(%rsp)
-
-	movups	16(%rcx),%xmm1
-
-	movdqa	64(%rsp),%xmm6
-	movdqa	80(%rsp),%xmm7
-
-	cmpq	$8,%rdx
-	jb	L$ctr32_tail
-
-	subq	$6,%rdx
-	cmpl	$4194304,%r10d
-	je	L$ctr32_6x
-
-	leaq	128(%rcx),%rcx
-	subq	$2,%rdx
-	jmp	L$ctr32_loop8
-
-.p2align	4
-L$ctr32_6x:
-	shll	$4,%eax
-	movl	$48,%r10d
-	bswapl	%ebp
-	leaq	32(%rcx,%rax,1),%rcx
-	subq	%rax,%r10
-	jmp	L$ctr32_loop6
-
-.p2align	4
-L$ctr32_loop6:
-	addl	$6,%r8d
-	movups	-48(%rcx,%r10,1),%xmm0
-.byte	102,15,56,220,209
-	movl	%r8d,%eax
-	xorl	%ebp,%eax
-.byte	102,15,56,220,217
-.byte	0x0f,0x38,0xf1,0x44,0x24,12
-	leal	1(%r8),%eax
-.byte	102,15,56,220,225
-	xorl	%ebp,%eax
-.byte	0x0f,0x38,0xf1,0x44,0x24,28
-.byte	102,15,56,220,233
-	leal	2(%r8),%eax
-	xorl	%ebp,%eax
-.byte	102,15,56,220,241
-.byte	0x0f,0x38,0xf1,0x44,0x24,44
-	leal	3(%r8),%eax
-.byte	102,15,56,220,249
-	movups	-32(%rcx,%r10,1),%xmm1
-	xorl	%ebp,%eax
-
-.byte	102,15,56,220,208
-.byte	0x0f,0x38,0xf1,0x44,0x24,60
-	leal	4(%r8),%eax
-.byte	102,15,56,220,216
-	xorl	%ebp,%eax
-.byte	0x0f,0x38,0xf1,0x44,0x24,76
-.byte	102,15,56,220,224
-	leal	5(%r8),%eax
-	xorl	%ebp,%eax
-.byte	102,15,56,220,232
-.byte	0x0f,0x38,0xf1,0x44,0x24,92
-	movq	%r10,%rax
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-	movups	-16(%rcx,%r10,1),%xmm0
-
-	call	L$enc_loop6
-
-	movdqu	(%rdi),%xmm8
-	movdqu	16(%rdi),%xmm9
-	movdqu	32(%rdi),%xmm10
-	movdqu	48(%rdi),%xmm11
-	movdqu	64(%rdi),%xmm12
-	movdqu	80(%rdi),%xmm13
-	leaq	96(%rdi),%rdi
-	movups	-64(%rcx,%r10,1),%xmm1
-	pxor	%xmm2,%xmm8
-	movaps	0(%rsp),%xmm2
-	pxor	%xmm3,%xmm9
-	movaps	16(%rsp),%xmm3
-	pxor	%xmm4,%xmm10
-	movaps	32(%rsp),%xmm4
-	pxor	%xmm5,%xmm11
-	movaps	48(%rsp),%xmm5
-	pxor	%xmm6,%xmm12
-	movaps	64(%rsp),%xmm6
-	pxor	%xmm7,%xmm13
-	movaps	80(%rsp),%xmm7
-	movdqu	%xmm8,(%rsi)
-	movdqu	%xmm9,16(%rsi)
-	movdqu	%xmm10,32(%rsi)
-	movdqu	%xmm11,48(%rsi)
-	movdqu	%xmm12,64(%rsi)
-	movdqu	%xmm13,80(%rsi)
-	leaq	96(%rsi),%rsi
-
-	subq	$6,%rdx
-	jnc	L$ctr32_loop6
-
-	addq	$6,%rdx
-	jz	L$ctr32_done
-
-	leal	-48(%r10),%eax
-	leaq	-80(%rcx,%r10,1),%rcx
-	negl	%eax
-	shrl	$4,%eax
-	jmp	L$ctr32_tail
-
-.p2align	5
-L$ctr32_loop8:
-	addl	$8,%r8d
-	movdqa	96(%rsp),%xmm8
-.byte	102,15,56,220,209
-	movl	%r8d,%r9d
-	movdqa	112(%rsp),%xmm9
-.byte	102,15,56,220,217
-	bswapl	%r9d
-	movups	32-128(%rcx),%xmm0
-.byte	102,15,56,220,225
-	xorl	%ebp,%r9d
-	nop
-.byte	102,15,56,220,233
-	movl	%r9d,0+12(%rsp)
-	leaq	1(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	48-128(%rcx),%xmm1
-	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-	movl	%r9d,16+12(%rsp)
-	leaq	2(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	64-128(%rcx),%xmm0
-	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movl	%r9d,32+12(%rsp)
-	leaq	3(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	80-128(%rcx),%xmm1
-	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-	movl	%r9d,48+12(%rsp)
-	leaq	4(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	96-128(%rcx),%xmm0
-	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movl	%r9d,64+12(%rsp)
-	leaq	5(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	112-128(%rcx),%xmm1
-	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-	movl	%r9d,80+12(%rsp)
-	leaq	6(%r8),%r9
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	128-128(%rcx),%xmm0
-	bswapl	%r9d
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	xorl	%ebp,%r9d
-.byte	0x66,0x90
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movl	%r9d,96+12(%rsp)
-	leaq	7(%r8),%r9
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	144-128(%rcx),%xmm1
-	bswapl	%r9d
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-	xorl	%ebp,%r9d
-	movdqu	0(%rdi),%xmm10
-.byte	102,15,56,220,232
-	movl	%r9d,112+12(%rsp)
-	cmpl	$11,%eax
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	160-128(%rcx),%xmm0
-
-	jb	L$ctr32_enc_done
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	176-128(%rcx),%xmm1
-
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	192-128(%rcx),%xmm0
-	je	L$ctr32_enc_done
-
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movups	208-128(%rcx),%xmm1
-
-.byte	102,15,56,220,208
-.byte	102,15,56,220,216
-.byte	102,15,56,220,224
-.byte	102,15,56,220,232
-.byte	102,15,56,220,240
-.byte	102,15,56,220,248
-.byte	102,68,15,56,220,192
-.byte	102,68,15,56,220,200
-	movups	224-128(%rcx),%xmm0
-	jmp	L$ctr32_enc_done
-
-.p2align	4
-L$ctr32_enc_done:
-	movdqu	16(%rdi),%xmm11
-	pxor	%xmm0,%xmm10
-	movdqu	32(%rdi),%xmm12
-	pxor	%xmm0,%xmm11
-	movdqu	48(%rdi),%xmm13
-	pxor	%xmm0,%xmm12
-	movdqu	64(%rdi),%xmm14
-	pxor	%xmm0,%xmm13
-	movdqu	80(%rdi),%xmm15
-	pxor	%xmm0,%xmm14
-	prefetcht0	448(%rdi)
-	prefetcht0	512(%rdi)
-	pxor	%xmm0,%xmm15
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-.byte	102,68,15,56,220,201
-	movdqu	96(%rdi),%xmm1
-	leaq	128(%rdi),%rdi
-
-.byte	102,65,15,56,221,210
-	pxor	%xmm0,%xmm1
-	movdqu	112-128(%rdi),%xmm10
-.byte	102,65,15,56,221,219
-	pxor	%xmm0,%xmm10
-	movdqa	0(%rsp),%xmm11
-.byte	102,65,15,56,221,228
-.byte	102,65,15,56,221,237
-	movdqa	16(%rsp),%xmm12
-	movdqa	32(%rsp),%xmm13
-.byte	102,65,15,56,221,246
-.byte	102,65,15,56,221,255
-	movdqa	48(%rsp),%xmm14
-	movdqa	64(%rsp),%xmm15
-.byte	102,68,15,56,221,193
-	movdqa	80(%rsp),%xmm0
-	movups	16-128(%rcx),%xmm1
-.byte	102,69,15,56,221,202
-
-	movups	%xmm2,(%rsi)
-	movdqa	%xmm11,%xmm2
-	movups	%xmm3,16(%rsi)
-	movdqa	%xmm12,%xmm3
-	movups	%xmm4,32(%rsi)
-	movdqa	%xmm13,%xmm4
-	movups	%xmm5,48(%rsi)
-	movdqa	%xmm14,%xmm5
-	movups	%xmm6,64(%rsi)
-	movdqa	%xmm15,%xmm6
-	movups	%xmm7,80(%rsi)
-	movdqa	%xmm0,%xmm7
-	movups	%xmm8,96(%rsi)
-	movups	%xmm9,112(%rsi)
-	leaq	128(%rsi),%rsi
-
-	subq	$8,%rdx
-	jnc	L$ctr32_loop8
-
-	addq	$8,%rdx
-	jz	L$ctr32_done
-	leaq	-128(%rcx),%rcx
-
-L$ctr32_tail:
-
-
-	leaq	16(%rcx),%rcx
-	cmpq	$4,%rdx
-	jb	L$ctr32_loop3
-	je	L$ctr32_loop4
-
-
-	shll	$4,%eax
-	movdqa	96(%rsp),%xmm8
-	pxor	%xmm9,%xmm9
-
-	movups	16(%rcx),%xmm0
-.byte	102,15,56,220,209
-.byte	102,15,56,220,217
-	leaq	32-16(%rcx,%rax,1),%rcx
-	negq	%rax
-.byte	102,15,56,220,225
-	addq	$16,%rax
-	movups	(%rdi),%xmm10
-.byte	102,15,56,220,233
-.byte	102,15,56,220,241
-	movups	16(%rdi),%xmm11
-	movups	32(%rdi),%xmm12
-.byte	102,15,56,220,249
-.byte	102,68,15,56,220,193
-
-	call	L$enc_loop8_enter
-
-	movdqu	48(%rdi),%xmm13
-	pxor	%xmm10,%xmm2
-	movdqu	64(%rdi),%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm10,%xmm6
-	movdqu	%xmm5,48(%rsi)
-	movdqu	%xmm6,64(%rsi)
-	cmpq	$6,%rdx
-	jb	L$ctr32_done
-
-	movups	80(%rdi),%xmm11
-	xorps	%xmm11,%xmm7
-	movups	%xmm7,80(%rsi)
-	je	L$ctr32_done
-
-	movups	96(%rdi),%xmm12
-	xorps	%xmm12,%xmm8
-	movups	%xmm8,96(%rsi)
-	jmp	L$ctr32_done
-
-.p2align	5
-L$ctr32_loop4:
-.byte	102,15,56,220,209
-	leaq	16(%rcx),%rcx
-	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-.byte	102,15,56,220,233
-	movups	(%rcx),%xmm1
-	jnz	L$ctr32_loop4
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
-	movups	(%rdi),%xmm10
-	movups	16(%rdi),%xmm11
-.byte	102,15,56,221,225
-.byte	102,15,56,221,233
-	movups	32(%rdi),%xmm12
-	movups	48(%rdi),%xmm13
-
-	xorps	%xmm10,%xmm2
-	movups	%xmm2,(%rsi)
-	xorps	%xmm11,%xmm3
-	movups	%xmm3,16(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm5,48(%rsi)
-	jmp	L$ctr32_done
-
-.p2align	5
-L$ctr32_loop3:
-.byte	102,15,56,220,209
-	leaq	16(%rcx),%rcx
-	decl	%eax
-.byte	102,15,56,220,217
-.byte	102,15,56,220,225
-	movups	(%rcx),%xmm1
-	jnz	L$ctr32_loop3
-.byte	102,15,56,221,209
-.byte	102,15,56,221,217
-.byte	102,15,56,221,225
-
-	movups	(%rdi),%xmm10
-	xorps	%xmm10,%xmm2
-	movups	%xmm2,(%rsi)
-	cmpq	$2,%rdx
-	jb	L$ctr32_done
-
-	movups	16(%rdi),%xmm11
-	xorps	%xmm11,%xmm3
-	movups	%xmm3,16(%rsi)
-	je	L$ctr32_done
-
-	movups	32(%rdi),%xmm12
-	xorps	%xmm12,%xmm4
-	movups	%xmm4,32(%rsi)
-
-L$ctr32_done:
-	xorps	%xmm0,%xmm0
-	xorl	%ebp,%ebp
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	movaps	%xmm0,0(%rsp)
-	pxor	%xmm8,%xmm8
-	movaps	%xmm0,16(%rsp)
-	pxor	%xmm9,%xmm9
-	movaps	%xmm0,32(%rsp)
-	pxor	%xmm10,%xmm10
-	movaps	%xmm0,48(%rsp)
-	pxor	%xmm11,%xmm11
-	movaps	%xmm0,64(%rsp)
-	pxor	%xmm12,%xmm12
-	movaps	%xmm0,80(%rsp)
-	pxor	%xmm13,%xmm13
-	movaps	%xmm0,96(%rsp)
-	pxor	%xmm14,%xmm14
-	movaps	%xmm0,112(%rsp)
-	pxor	%xmm15,%xmm15
-	movq	-8(%r11),%rbp
-
-	leaq	(%r11),%rsp
-
-L$ctr32_epilogue:
-	ret
-
-
-.globl	_aes_hw_cbc_encrypt
-.private_extern _aes_hw_cbc_encrypt
-
-.p2align	4
-_aes_hw_cbc_encrypt:
-
-_CET_ENDBR
-	testq	%rdx,%rdx
-	jz	L$cbc_ret
-
-	movl	240(%rcx),%r10d
-	movq	%rcx,%r11
-	testl	%r9d,%r9d
-	jz	L$cbc_decrypt
-
-	movups	(%r8),%xmm2
-	movl	%r10d,%eax
-	cmpq	$16,%rdx
-	jb	L$cbc_enc_tail
-	subq	$16,%rdx
-	jmp	L$cbc_enc_loop
-.p2align	4
-L$cbc_enc_loop:
-	movups	(%rdi),%xmm3
-	leaq	16(%rdi),%rdi
-
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	xorps	%xmm0,%xmm3
-	leaq	32(%rcx),%rcx
-	xorps	%xmm3,%xmm2
-L$oop_enc1_6:
-.byte	102,15,56,220,209
-	decl	%eax
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_enc1_6
-.byte	102,15,56,221,209
-	movl	%r10d,%eax
-	movq	%r11,%rcx
-	movups	%xmm2,0(%rsi)
-	leaq	16(%rsi),%rsi
-	subq	$16,%rdx
-	jnc	L$cbc_enc_loop
-	addq	$16,%rdx
-	jnz	L$cbc_enc_tail
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movups	%xmm2,(%r8)
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	jmp	L$cbc_ret
-
-L$cbc_enc_tail:
-	movq	%rdx,%rcx
-	xchgq	%rdi,%rsi
-.long	0x9066A4F3
-	movl	$16,%ecx
-	subq	%rdx,%rcx
-	xorl	%eax,%eax
-.long	0x9066AAF3
-	leaq	-16(%rdi),%rdi
-	movl	%r10d,%eax
-	movq	%rdi,%rsi
-	movq	%r11,%rcx
-	xorq	%rdx,%rdx
-	jmp	L$cbc_enc_loop
-
-.p2align	4
-L$cbc_decrypt:
-	cmpq	$16,%rdx
-	jne	L$cbc_decrypt_bulk
-
-
-
-	movdqu	(%rdi),%xmm2
-	movdqu	(%r8),%xmm3
-	movdqa	%xmm2,%xmm4
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-L$oop_dec1_7:
-.byte	102,15,56,222,209
-	decl	%r10d
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_dec1_7
-.byte	102,15,56,223,209
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movdqu	%xmm4,(%r8)
-	xorps	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	jmp	L$cbc_ret
-.p2align	4
-L$cbc_decrypt_bulk:
-	leaq	(%rsp),%r11
-
-	pushq	%rbp
-
-	subq	$16,%rsp
-	andq	$-16,%rsp
-	movq	%rcx,%rbp
-	movups	(%r8),%xmm10
-	movl	%r10d,%eax
-	cmpq	$0x50,%rdx
-	jbe	L$cbc_dec_tail
-
-	movups	(%rcx),%xmm0
-	movdqu	0(%rdi),%xmm2
-	movdqu	16(%rdi),%xmm3
-	movdqa	%xmm2,%xmm11
-	movdqu	32(%rdi),%xmm4
-	movdqa	%xmm3,%xmm12
-	movdqu	48(%rdi),%xmm5
-	movdqa	%xmm4,%xmm13
-	movdqu	64(%rdi),%xmm6
-	movdqa	%xmm5,%xmm14
-	movdqu	80(%rdi),%xmm7
-	movdqa	%xmm6,%xmm15
-	leaq	_OPENSSL_ia32cap_P(%rip),%r9
-	movl	4(%r9),%r9d
-	cmpq	$0x70,%rdx
-	jbe	L$cbc_dec_six_or_seven
-
-	andl	$71303168,%r9d
-	subq	$0x50,%rdx
-	cmpl	$4194304,%r9d
-	je	L$cbc_dec_loop6_enter
-	subq	$0x20,%rdx
-	leaq	112(%rcx),%rcx
-	jmp	L$cbc_dec_loop8_enter
-.p2align	4
-L$cbc_dec_loop8:
-	movups	%xmm9,(%rsi)
-	leaq	16(%rsi),%rsi
-L$cbc_dec_loop8_enter:
-	movdqu	96(%rdi),%xmm8
-	pxor	%xmm0,%xmm2
-	movdqu	112(%rdi),%xmm9
-	pxor	%xmm0,%xmm3
-	movups	16-112(%rcx),%xmm1
-	pxor	%xmm0,%xmm4
-	movq	$-1,%rbp
-	cmpq	$0x70,%rdx
-	pxor	%xmm0,%xmm5
-	pxor	%xmm0,%xmm6
-	pxor	%xmm0,%xmm7
-	pxor	%xmm0,%xmm8
-
-.byte	102,15,56,222,209
-	pxor	%xmm0,%xmm9
-	movups	32-112(%rcx),%xmm0
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-	adcq	$0,%rbp
-	andq	$128,%rbp
-.byte	102,68,15,56,222,201
-	addq	%rdi,%rbp
-	movups	48-112(%rcx),%xmm1
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	64-112(%rcx),%xmm0
-	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	80-112(%rcx),%xmm1
-	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	96-112(%rcx),%xmm0
-	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	112-112(%rcx),%xmm1
-	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	128-112(%rcx),%xmm0
-	nop
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	144-112(%rcx),%xmm1
-	cmpl	$11,%eax
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	160-112(%rcx),%xmm0
-	jb	L$cbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	176-112(%rcx),%xmm1
-	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	192-112(%rcx),%xmm0
-	je	L$cbc_dec_done
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movups	208-112(%rcx),%xmm1
-	nop
-.byte	102,15,56,222,208
-.byte	102,15,56,222,216
-.byte	102,15,56,222,224
-.byte	102,15,56,222,232
-.byte	102,15,56,222,240
-.byte	102,15,56,222,248
-.byte	102,68,15,56,222,192
-.byte	102,68,15,56,222,200
-	movups	224-112(%rcx),%xmm0
-	jmp	L$cbc_dec_done
-.p2align	4
-L$cbc_dec_done:
-.byte	102,15,56,222,209
-.byte	102,15,56,222,217
-	pxor	%xmm0,%xmm10
-	pxor	%xmm0,%xmm11
-.byte	102,15,56,222,225
-.byte	102,15,56,222,233
-	pxor	%xmm0,%xmm12
-	pxor	%xmm0,%xmm13
-.byte	102,15,56,222,241
-.byte	102,15,56,222,249
-	pxor	%xmm0,%xmm14
-	pxor	%xmm0,%xmm15
-.byte	102,68,15,56,222,193
-.byte	102,68,15,56,222,201
-	movdqu	80(%rdi),%xmm1
-
-.byte	102,65,15,56,223,210
-	movdqu	96(%rdi),%xmm10
-	pxor	%xmm0,%xmm1
-.byte	102,65,15,56,223,219
-	pxor	%xmm0,%xmm10
-	movdqu	112(%rdi),%xmm0
-.byte	102,65,15,56,223,228
-	leaq	128(%rdi),%rdi
-	movdqu	0(%rbp),%xmm11
-.byte	102,65,15,56,223,237
-.byte	102,65,15,56,223,246
-	movdqu	16(%rbp),%xmm12
-	movdqu	32(%rbp),%xmm13
-.byte	102,65,15,56,223,255
-.byte	102,68,15,56,223,193
-	movdqu	48(%rbp),%xmm14
-	movdqu	64(%rbp),%xmm15
-.byte	102,69,15,56,223,202
-	movdqa	%xmm0,%xmm10
-	movdqu	80(%rbp),%xmm1
-	movups	-112(%rcx),%xmm0
-
-	movups	%xmm2,(%rsi)
-	movdqa	%xmm11,%xmm2
-	movups	%xmm3,16(%rsi)
-	movdqa	%xmm12,%xmm3
-	movups	%xmm4,32(%rsi)
-	movdqa	%xmm13,%xmm4
-	movups	%xmm5,48(%rsi)
-	movdqa	%xmm14,%xmm5
-	movups	%xmm6,64(%rsi)
-	movdqa	%xmm15,%xmm6
-	movups	%xmm7,80(%rsi)
-	movdqa	%xmm1,%xmm7
-	movups	%xmm8,96(%rsi)
-	leaq	112(%rsi),%rsi
-
-	subq	$0x80,%rdx
-	ja	L$cbc_dec_loop8
-
-	movaps	%xmm9,%xmm2
-	leaq	-112(%rcx),%rcx
-	addq	$0x70,%rdx
-	jle	L$cbc_dec_clear_tail_collected
-	movups	%xmm9,(%rsi)
-	leaq	16(%rsi),%rsi
-	cmpq	$0x50,%rdx
-	jbe	L$cbc_dec_tail
-
-	movaps	%xmm11,%xmm2
-L$cbc_dec_six_or_seven:
-	cmpq	$0x60,%rdx
-	ja	L$cbc_dec_seven
-
-	movaps	%xmm7,%xmm8
-	call	_aesni_decrypt6
-	pxor	%xmm10,%xmm2
-	movaps	%xmm8,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	pxor	%xmm14,%xmm6
-	movdqu	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	pxor	%xmm15,%xmm7
-	movdqu	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	leaq	80(%rsi),%rsi
-	movdqa	%xmm7,%xmm2
-	pxor	%xmm7,%xmm7
-	jmp	L$cbc_dec_tail_collected
-
-.p2align	4
-L$cbc_dec_seven:
-	movups	96(%rdi),%xmm8
-	xorps	%xmm9,%xmm9
-	call	_aesni_decrypt8
-	movups	80(%rdi),%xmm9
-	pxor	%xmm10,%xmm2
-	movups	96(%rdi),%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	pxor	%xmm14,%xmm6
-	movdqu	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	pxor	%xmm15,%xmm7
-	movdqu	%xmm6,64(%rsi)
-	pxor	%xmm6,%xmm6
-	pxor	%xmm9,%xmm8
-	movdqu	%xmm7,80(%rsi)
-	pxor	%xmm7,%xmm7
-	leaq	96(%rsi),%rsi
-	movdqa	%xmm8,%xmm2
-	pxor	%xmm8,%xmm8
-	pxor	%xmm9,%xmm9
-	jmp	L$cbc_dec_tail_collected
-
-.p2align	4
-L$cbc_dec_loop6:
-	movups	%xmm7,(%rsi)
-	leaq	16(%rsi),%rsi
-	movdqu	0(%rdi),%xmm2
-	movdqu	16(%rdi),%xmm3
-	movdqa	%xmm2,%xmm11
-	movdqu	32(%rdi),%xmm4
-	movdqa	%xmm3,%xmm12
-	movdqu	48(%rdi),%xmm5
-	movdqa	%xmm4,%xmm13
-	movdqu	64(%rdi),%xmm6
-	movdqa	%xmm5,%xmm14
-	movdqu	80(%rdi),%xmm7
-	movdqa	%xmm6,%xmm15
-L$cbc_dec_loop6_enter:
-	leaq	96(%rdi),%rdi
-	movdqa	%xmm7,%xmm8
-
-	call	_aesni_decrypt6
-
-	pxor	%xmm10,%xmm2
-	movdqa	%xmm8,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm14,%xmm6
-	movq	%rbp,%rcx
-	movdqu	%xmm5,48(%rsi)
-	pxor	%xmm15,%xmm7
-	movl	%r10d,%eax
-	movdqu	%xmm6,64(%rsi)
-	leaq	80(%rsi),%rsi
-	subq	$0x60,%rdx
-	ja	L$cbc_dec_loop6
-
-	movdqa	%xmm7,%xmm2
-	addq	$0x50,%rdx
-	jle	L$cbc_dec_clear_tail_collected
-	movups	%xmm7,(%rsi)
-	leaq	16(%rsi),%rsi
-
-L$cbc_dec_tail:
-	movups	(%rdi),%xmm2
-	subq	$0x10,%rdx
-	jbe	L$cbc_dec_one
-
-	movups	16(%rdi),%xmm3
-	movaps	%xmm2,%xmm11
-	subq	$0x10,%rdx
-	jbe	L$cbc_dec_two
-
-	movups	32(%rdi),%xmm4
-	movaps	%xmm3,%xmm12
-	subq	$0x10,%rdx
-	jbe	L$cbc_dec_three
-
-	movups	48(%rdi),%xmm5
-	movaps	%xmm4,%xmm13
-	subq	$0x10,%rdx
-	jbe	L$cbc_dec_four
-
-	movups	64(%rdi),%xmm6
-	movaps	%xmm5,%xmm14
-	movaps	%xmm6,%xmm15
-	xorps	%xmm7,%xmm7
-	call	_aesni_decrypt6
-	pxor	%xmm10,%xmm2
-	movaps	%xmm15,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	pxor	%xmm14,%xmm6
-	movdqu	%xmm5,48(%rsi)
-	pxor	%xmm5,%xmm5
-	leaq	64(%rsi),%rsi
-	movdqa	%xmm6,%xmm2
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	subq	$0x10,%rdx
-	jmp	L$cbc_dec_tail_collected
-
-.p2align	4
-L$cbc_dec_one:
-	movaps	%xmm2,%xmm11
-	movups	(%rcx),%xmm0
-	movups	16(%rcx),%xmm1
-	leaq	32(%rcx),%rcx
-	xorps	%xmm0,%xmm2
-L$oop_dec1_8:
-.byte	102,15,56,222,209
-	decl	%eax
-	movups	(%rcx),%xmm1
-	leaq	16(%rcx),%rcx
-	jnz	L$oop_dec1_8
-.byte	102,15,56,223,209
-	xorps	%xmm10,%xmm2
-	movaps	%xmm11,%xmm10
-	jmp	L$cbc_dec_tail_collected
-.p2align	4
-L$cbc_dec_two:
-	movaps	%xmm3,%xmm12
-	call	_aesni_decrypt2
-	pxor	%xmm10,%xmm2
-	movaps	%xmm12,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	movdqa	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	leaq	16(%rsi),%rsi
-	jmp	L$cbc_dec_tail_collected
-.p2align	4
-L$cbc_dec_three:
-	movaps	%xmm4,%xmm13
-	call	_aesni_decrypt3
-	pxor	%xmm10,%xmm2
-	movaps	%xmm13,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	movdqa	%xmm4,%xmm2
-	pxor	%xmm4,%xmm4
-	leaq	32(%rsi),%rsi
-	jmp	L$cbc_dec_tail_collected
-.p2align	4
-L$cbc_dec_four:
-	movaps	%xmm5,%xmm14
-	call	_aesni_decrypt4
-	pxor	%xmm10,%xmm2
-	movaps	%xmm14,%xmm10
-	pxor	%xmm11,%xmm3
-	movdqu	%xmm2,(%rsi)
-	pxor	%xmm12,%xmm4
-	movdqu	%xmm3,16(%rsi)
-	pxor	%xmm3,%xmm3
-	pxor	%xmm13,%xmm5
-	movdqu	%xmm4,32(%rsi)
-	pxor	%xmm4,%xmm4
-	movdqa	%xmm5,%xmm2
-	pxor	%xmm5,%xmm5
-	leaq	48(%rsi),%rsi
-	jmp	L$cbc_dec_tail_collected
-
-.p2align	4
-L$cbc_dec_clear_tail_collected:
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	pxor	%xmm8,%xmm8
-	pxor	%xmm9,%xmm9
-L$cbc_dec_tail_collected:
-	movups	%xmm10,(%r8)
-	andq	$15,%rdx
-	jnz	L$cbc_dec_tail_partial
-	movups	%xmm2,(%rsi)
-	pxor	%xmm2,%xmm2
-	jmp	L$cbc_dec_ret
-.p2align	4
-L$cbc_dec_tail_partial:
-	movaps	%xmm2,(%rsp)
-	pxor	%xmm2,%xmm2
-	movq	$16,%rcx
-	movq	%rsi,%rdi
-	subq	%rdx,%rcx
-	leaq	(%rsp),%rsi
-.long	0x9066A4F3
-	movdqa	%xmm2,(%rsp)
-
-L$cbc_dec_ret:
-	xorps	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	movq	-8(%r11),%rbp
-
-	leaq	(%r11),%rsp
-
-L$cbc_ret:
-	ret
-
-
-.globl	_aes_hw_set_decrypt_key
-.private_extern _aes_hw_set_decrypt_key
-
-.p2align	4
-_aes_hw_set_decrypt_key:
-
-_CET_ENDBR
-.byte	0x48,0x83,0xEC,0x08
-
-	call	__aesni_set_encrypt_key
-	shll	$4,%esi
-	testl	%eax,%eax
-	jnz	L$dec_key_ret
-	leaq	16(%rdx,%rsi,1),%rdi
-
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
-	movups	%xmm0,(%rdi)
-	movups	%xmm1,(%rdx)
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
-
-L$dec_key_inverse:
-	movups	(%rdx),%xmm0
-	movups	(%rdi),%xmm1
-.byte	102,15,56,219,192
-.byte	102,15,56,219,201
-	leaq	16(%rdx),%rdx
-	leaq	-16(%rdi),%rdi
-	movups	%xmm0,16(%rdi)
-	movups	%xmm1,-16(%rdx)
-	cmpq	%rdx,%rdi
-	ja	L$dec_key_inverse
-
-	movups	(%rdx),%xmm0
-.byte	102,15,56,219,192
-	pxor	%xmm1,%xmm1
-	movups	%xmm0,(%rdi)
-	pxor	%xmm0,%xmm0
-L$dec_key_ret:
-	addq	$8,%rsp
-
-	ret
-
-L$SEH_end_set_decrypt_key:
-
-.globl	_aes_hw_set_encrypt_key
-.private_extern _aes_hw_set_encrypt_key
-
-.p2align	4
-_aes_hw_set_encrypt_key:
-__aesni_set_encrypt_key:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-	movb	$1,_BORINGSSL_function_hit+3(%rip)
-#endif
-.byte	0x48,0x83,0xEC,0x08
-
-	movq	$-1,%rax
-	testq	%rdi,%rdi
-	jz	L$enc_key_ret
-	testq	%rdx,%rdx
-	jz	L$enc_key_ret
-
-	movups	(%rdi),%xmm0
-	xorps	%xmm4,%xmm4
-	leaq	_OPENSSL_ia32cap_P(%rip),%r10
-	movl	4(%r10),%r10d
-	andl	$268437504,%r10d
-	leaq	16(%rdx),%rax
-	cmpl	$256,%esi
-	je	L$14rounds
-	cmpl	$192,%esi
-	je	L$12rounds
-	cmpl	$128,%esi
-	jne	L$bad_keybits
-
-L$10rounds:
-	movl	$9,%esi
-	cmpl	$268435456,%r10d
-	je	L$10rounds_alt
-
-	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,200,1
-	call	L$key_expansion_128_cold
-.byte	102,15,58,223,200,2
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,4
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,8
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,16
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,32
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,64
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,128
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,27
-	call	L$key_expansion_128
-.byte	102,15,58,223,200,54
-	call	L$key_expansion_128
-	movups	%xmm0,(%rax)
-	movl	%esi,80(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$10rounds_alt:
-	movdqa	L$key_rotate(%rip),%xmm5
-	movl	$8,%r10d
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movdqa	%xmm0,%xmm2
-	movdqu	%xmm0,(%rdx)
-	jmp	L$oop_key128
-
-.p2align	4
-L$oop_key128:
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-	leaq	16(%rax),%rax
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,-16(%rax)
-	movdqa	%xmm0,%xmm2
-
-	decl	%r10d
-	jnz	L$oop_key128
-
-	movdqa	L$key_rcon1b(%rip),%xmm4
-
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-	pslld	$1,%xmm4
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	movdqa	%xmm0,%xmm2
-.byte	102,15,56,0,197
-.byte	102,15,56,221,196
-
-	movdqa	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm2,%xmm3
-	pslldq	$4,%xmm2
-	pxor	%xmm3,%xmm2
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,16(%rax)
-
-	movl	%esi,96(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$12rounds:
-	movq	16(%rdi),%xmm2
-	movl	$11,%esi
-	cmpl	$268435456,%r10d
-	je	L$12rounds_alt
-
-	movups	%xmm0,(%rdx)
-.byte	102,15,58,223,202,1
-	call	L$key_expansion_192a_cold
-.byte	102,15,58,223,202,2
-	call	L$key_expansion_192b
-.byte	102,15,58,223,202,4
-	call	L$key_expansion_192a
-.byte	102,15,58,223,202,8
-	call	L$key_expansion_192b
-.byte	102,15,58,223,202,16
-	call	L$key_expansion_192a
-.byte	102,15,58,223,202,32
-	call	L$key_expansion_192b
-.byte	102,15,58,223,202,64
-	call	L$key_expansion_192a
-.byte	102,15,58,223,202,128
-	call	L$key_expansion_192b
-	movups	%xmm0,(%rax)
-	movl	%esi,48(%rax)
-	xorq	%rax,%rax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$12rounds_alt:
-	movdqa	L$key_rotate192(%rip),%xmm5
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movl	$8,%r10d
-	movdqu	%xmm0,(%rdx)
-	jmp	L$oop_key192
-
-.p2align	4
-L$oop_key192:
-	movq	%xmm2,0(%rax)
-	movdqa	%xmm2,%xmm1
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-	pslld	$1,%xmm4
-	leaq	24(%rax),%rax
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-
-	pshufd	$0xff,%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-
-	pxor	%xmm2,%xmm0
-	pxor	%xmm3,%xmm2
-	movdqu	%xmm0,-16(%rax)
-
-	decl	%r10d
-	jnz	L$oop_key192
-
-	movl	%esi,32(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$14rounds:
-	movups	16(%rdi),%xmm2
-	movl	$13,%esi
-	leaq	16(%rax),%rax
-	cmpl	$268435456,%r10d
-	je	L$14rounds_alt
-
-	movups	%xmm0,(%rdx)
-	movups	%xmm2,16(%rdx)
-.byte	102,15,58,223,202,1
-	call	L$key_expansion_256a_cold
-.byte	102,15,58,223,200,1
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,2
-	call	L$key_expansion_256a
-.byte	102,15,58,223,200,2
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,4
-	call	L$key_expansion_256a
-.byte	102,15,58,223,200,4
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,8
-	call	L$key_expansion_256a
-.byte	102,15,58,223,200,8
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,16
-	call	L$key_expansion_256a
-.byte	102,15,58,223,200,16
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,32
-	call	L$key_expansion_256a
-.byte	102,15,58,223,200,32
-	call	L$key_expansion_256b
-.byte	102,15,58,223,202,64
-	call	L$key_expansion_256a
-	movups	%xmm0,(%rax)
-	movl	%esi,16(%rax)
-	xorq	%rax,%rax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$14rounds_alt:
-	movdqa	L$key_rotate(%rip),%xmm5
-	movdqa	L$key_rcon1(%rip),%xmm4
-	movl	$7,%r10d
-	movdqu	%xmm0,0(%rdx)
-	movdqa	%xmm2,%xmm1
-	movdqu	%xmm2,16(%rdx)
-	jmp	L$oop_key256
-
-.p2align	4
-L$oop_key256:
-.byte	102,15,56,0,213
-.byte	102,15,56,221,212
-
-	movdqa	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm0,%xmm3
-	pslldq	$4,%xmm0
-	pxor	%xmm3,%xmm0
-	pslld	$1,%xmm4
-
-	pxor	%xmm2,%xmm0
-	movdqu	%xmm0,(%rax)
-
-	decl	%r10d
-	jz	L$done_key256
-
-	pshufd	$0xff,%xmm0,%xmm2
-	pxor	%xmm3,%xmm3
-.byte	102,15,56,221,211
-
-	movdqa	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm1,%xmm3
-	pslldq	$4,%xmm1
-	pxor	%xmm3,%xmm1
-
-	pxor	%xmm1,%xmm2
-	movdqu	%xmm2,16(%rax)
-	leaq	32(%rax),%rax
-	movdqa	%xmm2,%xmm1
-
-	jmp	L$oop_key256
-
-L$done_key256:
-	movl	%esi,16(%rax)
-	xorl	%eax,%eax
-	jmp	L$enc_key_ret
-
-.p2align	4
-L$bad_keybits:
-	movq	$-2,%rax
-L$enc_key_ret:
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	addq	$8,%rsp
-
-	ret
-
-L$SEH_end_set_encrypt_key:
-
-.p2align	4
-L$key_expansion_128:
-	movups	%xmm0,(%rax)
-	leaq	16(%rax),%rax
-L$key_expansion_128_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-
-.p2align	4
-L$key_expansion_192a:
-	movups	%xmm0,(%rax)
-	leaq	16(%rax),%rax
-L$key_expansion_192a_cold:
-	movaps	%xmm2,%xmm5
-L$key_expansion_192b_warm:
-	shufps	$16,%xmm0,%xmm4
-	movdqa	%xmm2,%xmm3
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	pslldq	$4,%xmm3
-	xorps	%xmm4,%xmm0
-	pshufd	$85,%xmm1,%xmm1
-	pxor	%xmm3,%xmm2
-	pxor	%xmm1,%xmm0
-	pshufd	$255,%xmm0,%xmm3
-	pxor	%xmm3,%xmm2
-	ret
-
-.p2align	4
-L$key_expansion_192b:
-	movaps	%xmm0,%xmm3
-	shufps	$68,%xmm0,%xmm5
-	movups	%xmm5,(%rax)
-	shufps	$78,%xmm2,%xmm3
-	movups	%xmm3,16(%rax)
-	leaq	32(%rax),%rax
-	jmp	L$key_expansion_192b_warm
-
-.p2align	4
-L$key_expansion_256a:
-	movups	%xmm2,(%rax)
-	leaq	16(%rax),%rax
-L$key_expansion_256a_cold:
-	shufps	$16,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$140,%xmm0,%xmm4
-	xorps	%xmm4,%xmm0
-	shufps	$255,%xmm1,%xmm1
-	xorps	%xmm1,%xmm0
-	ret
-
-.p2align	4
-L$key_expansion_256b:
-	movups	%xmm0,(%rax)
-	leaq	16(%rax),%rax
-
-	shufps	$16,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$140,%xmm2,%xmm4
-	xorps	%xmm4,%xmm2
-	shufps	$170,%xmm1,%xmm1
-	xorps	%xmm1,%xmm2
-	ret
-
-
-.section	__DATA,__const
-.p2align	6
-L$bswap_mask:
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$increment32:
-.long	6,6,6,0
-L$increment64:
-.long	1,0,0,0
-L$xts_magic:
-.long	0x87,0,1,0
-L$increment1:
-.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-L$key_rotate:
-.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
-L$key_rotate192:
-.long	0x04070605,0x04070605,0x04070605,0x04070605
-L$key_rcon1:
-.long	1,1,1,1
-L$key_rcon1b:
-.long	0x1b,0x1b,0x1b,0x1b
-
-.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align	6
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S
deleted file mode 100644
index bcbf824..0000000
--- a/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64-apple.S
+++ /dev/null
@@ -1,423 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-
-
-
-.globl	_gcm_gmult_ssse3
-.private_extern _gcm_gmult_ssse3
-.p2align	4
-_gcm_gmult_ssse3:
-
-
-_CET_ENDBR
-	movdqu	(%rdi),%xmm0
-	movdqa	L$reverse_bytes(%rip),%xmm10
-	movdqa	L$low4_mask(%rip),%xmm2
-
-
-.byte	102,65,15,56,0,194
-
-
-	movdqa	%xmm2,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm2,%xmm0
-
-
-
-
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	movq	$5,%rax
-L$oop_row_1:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_1
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movq	$5,%rax
-L$oop_row_2:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_2
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movq	$6,%rax
-L$oop_row_3:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_3
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-
-.byte	102,65,15,56,0,210
-	movdqu	%xmm2,(%rdi)
-
-
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	ret
-
-
-
-
-
-
-
-
-
-.globl	_gcm_ghash_ssse3
-.private_extern _gcm_ghash_ssse3
-.p2align	4
-_gcm_ghash_ssse3:
-
-
-_CET_ENDBR
-	movdqu	(%rdi),%xmm0
-	movdqa	L$reverse_bytes(%rip),%xmm10
-	movdqa	L$low4_mask(%rip),%xmm11
-
-
-	andq	$-16,%rcx
-
-
-
-.byte	102,65,15,56,0,194
-
-
-	pxor	%xmm3,%xmm3
-L$oop_ghash:
-
-	movdqu	(%rdx),%xmm1
-.byte	102,65,15,56,0,202
-	pxor	%xmm1,%xmm0
-
-
-	movdqa	%xmm11,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm11,%xmm0
-
-
-
-
-	pxor	%xmm2,%xmm2
-
-	movq	$5,%rax
-L$oop_row_4:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_4
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movq	$5,%rax
-L$oop_row_5:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_5
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movq	$6,%rax
-L$oop_row_6:
-	movdqa	(%rsi),%xmm4
-	leaq	16(%rsi),%rsi
-
-
-	movdqa	%xmm2,%xmm6
-.byte	102,15,58,15,243,1
-	movdqa	%xmm6,%xmm3
-	psrldq	$1,%xmm2
-
-
-
-
-	movdqa	%xmm4,%xmm5
-.byte	102,15,56,0,224
-.byte	102,15,56,0,233
-
-
-	pxor	%xmm5,%xmm2
-
-
-
-	movdqa	%xmm4,%xmm5
-	psllq	$60,%xmm5
-	movdqa	%xmm5,%xmm6
-	pslldq	$8,%xmm6
-	pxor	%xmm6,%xmm3
-
-
-	psrldq	$8,%xmm5
-	pxor	%xmm5,%xmm2
-	psrlq	$4,%xmm4
-	pxor	%xmm4,%xmm2
-
-	subq	$1,%rax
-	jnz	L$oop_row_6
-
-
-
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$1,%xmm3
-	pxor	%xmm3,%xmm2
-	psrlq	$5,%xmm3
-	pxor	%xmm3,%xmm2
-	pxor	%xmm3,%xmm3
-	movdqa	%xmm2,%xmm0
-
-
-	leaq	-256(%rsi),%rsi
-
-
-	leaq	16(%rdx),%rdx
-	subq	$16,%rcx
-	jnz	L$oop_ghash
-
-
-.byte	102,65,15,56,0,194
-	movdqu	%xmm0,(%rdi)
-
-
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	ret
-
-
-
-
-.section	__DATA,__const
-.p2align	4
-
-
-L$reverse_bytes:
-.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-L$low4_mask:
-.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S
deleted file mode 100644
index c17d8f7..0000000
--- a/apple-x86_64/crypto/fipsmodule/ghash-x86_64-apple.S
+++ /dev/null
@@ -1,1132 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-.globl	_gcm_init_clmul
-.private_extern _gcm_init_clmul
-
-.p2align	4
-_gcm_init_clmul:
-
-
-_CET_ENDBR
-L$_init_clmul:
-	movdqu	(%rsi),%xmm2
-	pshufd	$78,%xmm2,%xmm2
-
-
-	pshufd	$255,%xmm2,%xmm4
-	movdqa	%xmm2,%xmm3
-	psllq	$1,%xmm2
-	pxor	%xmm5,%xmm5
-	psrlq	$63,%xmm3
-	pcmpgtd	%xmm4,%xmm5
-	pslldq	$8,%xmm3
-	por	%xmm3,%xmm2
-
-
-	pand	L$0x1c2_polynomial(%rip),%xmm5
-	pxor	%xmm5,%xmm2
-
-
-	pshufd	$78,%xmm2,%xmm6
-	movdqa	%xmm2,%xmm0
-	pxor	%xmm2,%xmm6
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	pshufd	$78,%xmm2,%xmm3
-	pshufd	$78,%xmm0,%xmm4
-	pxor	%xmm2,%xmm3
-	movdqu	%xmm2,0(%rdi)
-	pxor	%xmm0,%xmm4
-	movdqu	%xmm0,16(%rdi)
-.byte	102,15,58,15,227,8
-	movdqu	%xmm4,32(%rdi)
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm0,%xmm5
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,222,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	pshufd	$78,%xmm5,%xmm3
-	pshufd	$78,%xmm0,%xmm4
-	pxor	%xmm5,%xmm3
-	movdqu	%xmm5,48(%rdi)
-	pxor	%xmm0,%xmm4
-	movdqu	%xmm0,64(%rdi)
-.byte	102,15,58,15,227,8
-	movdqu	%xmm4,80(%rdi)
-	ret
-
-
-
-.globl	_gcm_gmult_clmul
-.private_extern _gcm_gmult_clmul
-
-.p2align	4
-_gcm_gmult_clmul:
-
-_CET_ENDBR
-L$_gmult_clmul:
-	movdqu	(%rdi),%xmm0
-	movdqa	L$bswap_mask(%rip),%xmm5
-	movdqu	(%rsi),%xmm2
-	movdqu	32(%rsi),%xmm4
-.byte	102,15,56,0,197
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,220,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,197
-	movdqu	%xmm0,(%rdi)
-	ret
-
-
-.globl	_gcm_ghash_clmul
-.private_extern _gcm_ghash_clmul
-
-.p2align	5
-_gcm_ghash_clmul:
-
-
-_CET_ENDBR
-L$_ghash_clmul:
-	movdqa	L$bswap_mask(%rip),%xmm10
-
-	movdqu	(%rdi),%xmm0
-	movdqu	(%rsi),%xmm2
-	movdqu	32(%rsi),%xmm7
-.byte	102,65,15,56,0,194
-
-	subq	$0x10,%rcx
-	jz	L$odd_tail
-
-	movdqu	16(%rsi),%xmm6
-	leaq	_OPENSSL_ia32cap_P(%rip),%rax
-	movl	4(%rax),%eax
-	cmpq	$0x30,%rcx
-	jb	L$skip4x
-
-	andl	$71303168,%eax
-	cmpl	$4194304,%eax
-	je	L$skip4x
-
-	subq	$0x30,%rcx
-	movq	$0xA040608020C0E000,%rax
-	movdqu	48(%rsi),%xmm14
-	movdqu	64(%rsi),%xmm15
-
-
-
-
-	movdqu	48(%rdx),%xmm3
-	movdqu	32(%rdx),%xmm11
-.byte	102,65,15,56,0,218
-.byte	102,69,15,56,0,218
-	movdqa	%xmm3,%xmm5
-	pshufd	$78,%xmm3,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
-
-	movdqa	%xmm11,%xmm13
-	pshufd	$78,%xmm11,%xmm12
-	pxor	%xmm11,%xmm12
-.byte	102,68,15,58,68,222,0
-.byte	102,68,15,58,68,238,17
-.byte	102,68,15,58,68,231,16
-	xorps	%xmm11,%xmm3
-	xorps	%xmm13,%xmm5
-	movups	80(%rsi),%xmm7
-	xorps	%xmm12,%xmm4
-
-	movdqu	16(%rdx),%xmm11
-	movdqu	0(%rdx),%xmm8
-.byte	102,69,15,56,0,218
-.byte	102,69,15,56,0,194
-	movdqa	%xmm11,%xmm13
-	pshufd	$78,%xmm11,%xmm12
-	pxor	%xmm8,%xmm0
-	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm8
-	pxor	%xmm0,%xmm8
-.byte	102,69,15,58,68,238,17
-.byte	102,68,15,58,68,231,0
-	xorps	%xmm11,%xmm3
-	xorps	%xmm13,%xmm5
-
-	leaq	64(%rdx),%rdx
-	subq	$0x40,%rcx
-	jc	L$tail4x
-
-	jmp	L$mod4_loop
-.p2align	5
-L$mod4_loop:
-.byte	102,65,15,58,68,199,0
-	xorps	%xmm12,%xmm4
-	movdqu	48(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,65,15,58,68,207,17
-	xorps	%xmm3,%xmm0
-	movdqu	32(%rdx),%xmm3
-	movdqa	%xmm11,%xmm13
-.byte	102,68,15,58,68,199,16
-	pshufd	$78,%xmm11,%xmm12
-	xorps	%xmm5,%xmm1
-	pxor	%xmm11,%xmm12
-.byte	102,65,15,56,0,218
-	movups	32(%rsi),%xmm7
-	xorps	%xmm4,%xmm8
-.byte	102,68,15,58,68,218,0
-	pshufd	$78,%xmm3,%xmm4
-
-	pxor	%xmm0,%xmm8
-	movdqa	%xmm3,%xmm5
-	pxor	%xmm1,%xmm8
-	pxor	%xmm3,%xmm4
-	movdqa	%xmm8,%xmm9
-.byte	102,68,15,58,68,234,17
-	pslldq	$8,%xmm8
-	psrldq	$8,%xmm9
-	pxor	%xmm8,%xmm0
-	movdqa	L$7_mask(%rip),%xmm8
-	pxor	%xmm9,%xmm1
-.byte	102,76,15,110,200
-
-	pand	%xmm0,%xmm8
-.byte	102,69,15,56,0,200
-	pxor	%xmm0,%xmm9
-.byte	102,68,15,58,68,231,0
-	psllq	$57,%xmm9
-	movdqa	%xmm9,%xmm8
-	pslldq	$8,%xmm9
-.byte	102,15,58,68,222,0
-	psrldq	$8,%xmm8
-	pxor	%xmm9,%xmm0
-	pxor	%xmm8,%xmm1
-	movdqu	0(%rdx),%xmm8
-
-	movdqa	%xmm0,%xmm9
-	psrlq	$1,%xmm0
-.byte	102,15,58,68,238,17
-	xorps	%xmm11,%xmm3
-	movdqu	16(%rdx),%xmm11
-.byte	102,69,15,56,0,218
-.byte	102,15,58,68,231,16
-	xorps	%xmm13,%xmm5
-	movups	80(%rsi),%xmm7
-.byte	102,69,15,56,0,194
-	pxor	%xmm9,%xmm1
-	pxor	%xmm0,%xmm9
-	psrlq	$5,%xmm0
-
-	movdqa	%xmm11,%xmm13
-	pxor	%xmm12,%xmm4
-	pshufd	$78,%xmm11,%xmm12
-	pxor	%xmm9,%xmm0
-	pxor	%xmm8,%xmm1
-	pxor	%xmm11,%xmm12
-.byte	102,69,15,58,68,222,0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm0,%xmm1
-.byte	102,69,15,58,68,238,17
-	xorps	%xmm11,%xmm3
-	pshufd	$78,%xmm0,%xmm8
-	pxor	%xmm0,%xmm8
-
-.byte	102,68,15,58,68,231,0
-	xorps	%xmm13,%xmm5
-
-	leaq	64(%rdx),%rdx
-	subq	$0x40,%rcx
-	jnc	L$mod4_loop
-
-L$tail4x:
-.byte	102,65,15,58,68,199,0
-.byte	102,65,15,58,68,207,17
-.byte	102,68,15,58,68,199,16
-	xorps	%xmm12,%xmm4
-	xorps	%xmm3,%xmm0
-	xorps	%xmm5,%xmm1
-	pxor	%xmm0,%xmm1
-	pxor	%xmm4,%xmm8
-
-	pxor	%xmm1,%xmm8
-	pxor	%xmm0,%xmm1
-
-	movdqa	%xmm8,%xmm9
-	psrldq	$8,%xmm8
-	pslldq	$8,%xmm9
-	pxor	%xmm8,%xmm1
-	pxor	%xmm9,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	addq	$0x40,%rcx
-	jz	L$done
-	movdqu	32(%rsi),%xmm7
-	subq	$0x10,%rcx
-	jz	L$odd_tail
-L$skip4x:
-
-
-
-
-
-	movdqu	(%rdx),%xmm8
-	movdqu	16(%rdx),%xmm3
-.byte	102,69,15,56,0,194
-.byte	102,65,15,56,0,218
-	pxor	%xmm8,%xmm0
-
-	movdqa	%xmm3,%xmm5
-	pshufd	$78,%xmm3,%xmm4
-	pxor	%xmm3,%xmm4
-.byte	102,15,58,68,218,0
-.byte	102,15,58,68,234,17
-.byte	102,15,58,68,231,0
-
-	leaq	32(%rdx),%rdx
-	nop
-	subq	$0x20,%rcx
-	jbe	L$even_tail
-	nop
-	jmp	L$mod_loop
-
-.p2align	5
-L$mod_loop:
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm8
-	pshufd	$78,%xmm0,%xmm4
-	pxor	%xmm0,%xmm4
-
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
-
-	pxor	%xmm3,%xmm0
-	pxor	%xmm5,%xmm1
-	movdqu	(%rdx),%xmm9
-	pxor	%xmm0,%xmm8
-.byte	102,69,15,56,0,202
-	movdqu	16(%rdx),%xmm3
-
-	pxor	%xmm1,%xmm8
-	pxor	%xmm9,%xmm1
-	pxor	%xmm8,%xmm4
-.byte	102,65,15,56,0,218
-	movdqa	%xmm4,%xmm8
-	psrldq	$8,%xmm8
-	pslldq	$8,%xmm4
-	pxor	%xmm8,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm3,%xmm5
-
-	movdqa	%xmm0,%xmm9
-	movdqa	%xmm0,%xmm8
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm8
-.byte	102,15,58,68,218,0
-	psllq	$1,%xmm0
-	pxor	%xmm8,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm8
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm8
-	pxor	%xmm9,%xmm0
-	pshufd	$78,%xmm5,%xmm4
-	pxor	%xmm8,%xmm1
-	pxor	%xmm5,%xmm4
-
-	movdqa	%xmm0,%xmm9
-	psrlq	$1,%xmm0
-.byte	102,15,58,68,234,17
-	pxor	%xmm9,%xmm1
-	pxor	%xmm0,%xmm9
-	psrlq	$5,%xmm0
-	pxor	%xmm9,%xmm0
-	leaq	32(%rdx),%rdx
-	psrlq	$1,%xmm0
-.byte	102,15,58,68,231,0
-	pxor	%xmm1,%xmm0
-
-	subq	$0x20,%rcx
-	ja	L$mod_loop
-
-L$even_tail:
-	movdqa	%xmm0,%xmm1
-	movdqa	%xmm4,%xmm8
-	pshufd	$78,%xmm0,%xmm4
-	pxor	%xmm0,%xmm4
-
-.byte	102,15,58,68,198,0
-.byte	102,15,58,68,206,17
-.byte	102,15,58,68,231,16
-
-	pxor	%xmm3,%xmm0
-	pxor	%xmm5,%xmm1
-	pxor	%xmm0,%xmm8
-	pxor	%xmm1,%xmm8
-	pxor	%xmm8,%xmm4
-	movdqa	%xmm4,%xmm8
-	psrldq	$8,%xmm8
-	pslldq	$8,%xmm4
-	pxor	%xmm8,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-	testq	%rcx,%rcx
-	jnz	L$done
-
-L$odd_tail:
-	movdqu	(%rdx),%xmm8
-.byte	102,69,15,56,0,194
-	pxor	%xmm8,%xmm0
-	movdqa	%xmm0,%xmm1
-	pshufd	$78,%xmm0,%xmm3
-	pxor	%xmm0,%xmm3
-.byte	102,15,58,68,194,0
-.byte	102,15,58,68,202,17
-.byte	102,15,58,68,223,0
-	pxor	%xmm0,%xmm3
-	pxor	%xmm1,%xmm3
-
-	movdqa	%xmm3,%xmm4
-	psrldq	$8,%xmm3
-	pslldq	$8,%xmm4
-	pxor	%xmm3,%xmm1
-	pxor	%xmm4,%xmm0
-
-	movdqa	%xmm0,%xmm4
-	movdqa	%xmm0,%xmm3
-	psllq	$5,%xmm0
-	pxor	%xmm0,%xmm3
-	psllq	$1,%xmm0
-	pxor	%xmm3,%xmm0
-	psllq	$57,%xmm0
-	movdqa	%xmm0,%xmm3
-	pslldq	$8,%xmm0
-	psrldq	$8,%xmm3
-	pxor	%xmm4,%xmm0
-	pxor	%xmm3,%xmm1
-
-
-	movdqa	%xmm0,%xmm4
-	psrlq	$1,%xmm0
-	pxor	%xmm4,%xmm1
-	pxor	%xmm0,%xmm4
-	psrlq	$5,%xmm0
-	pxor	%xmm4,%xmm0
-	psrlq	$1,%xmm0
-	pxor	%xmm1,%xmm0
-L$done:
-.byte	102,65,15,56,0,194
-	movdqu	%xmm0,(%rdi)
-	ret
-
-
-
-.globl	_gcm_init_avx
-.private_extern _gcm_init_avx
-
-.p2align	5
-_gcm_init_avx:
-
-_CET_ENDBR
-	vzeroupper
-
-	vmovdqu	(%rsi),%xmm2
-	vpshufd	$78,%xmm2,%xmm2
-
-
-	vpshufd	$255,%xmm2,%xmm4
-	vpsrlq	$63,%xmm2,%xmm3
-	vpsllq	$1,%xmm2,%xmm2
-	vpxor	%xmm5,%xmm5,%xmm5
-	vpcmpgtd	%xmm4,%xmm5,%xmm5
-	vpslldq	$8,%xmm3,%xmm3
-	vpor	%xmm3,%xmm2,%xmm2
-
-
-	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
-	vpxor	%xmm5,%xmm2,%xmm2
-
-	vpunpckhqdq	%xmm2,%xmm2,%xmm6
-	vmovdqa	%xmm2,%xmm0
-	vpxor	%xmm2,%xmm6,%xmm6
-	movq	$4,%r10
-	jmp	L$init_start_avx
-.p2align	5
-L$init_loop_avx:
-	vpalignr	$8,%xmm3,%xmm4,%xmm5
-	vmovdqu	%xmm5,-16(%rdi)
-	vpunpckhqdq	%xmm0,%xmm0,%xmm3
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
-	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
-	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
-	vpxor	%xmm0,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-
-	vpslldq	$8,%xmm3,%xmm4
-	vpsrldq	$8,%xmm3,%xmm3
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpsllq	$57,%xmm0,%xmm3
-	vpsllq	$62,%xmm0,%xmm4
-	vpxor	%xmm3,%xmm4,%xmm4
-	vpsllq	$63,%xmm0,%xmm3
-	vpxor	%xmm3,%xmm4,%xmm4
-	vpslldq	$8,%xmm4,%xmm3
-	vpsrldq	$8,%xmm4,%xmm4
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vpsrlq	$1,%xmm0,%xmm4
-	vpxor	%xmm0,%xmm1,%xmm1
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpsrlq	$5,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpsrlq	$1,%xmm0,%xmm0
-	vpxor	%xmm1,%xmm0,%xmm0
-L$init_start_avx:
-	vmovdqa	%xmm0,%xmm5
-	vpunpckhqdq	%xmm0,%xmm0,%xmm3
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
-	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
-	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
-	vpxor	%xmm0,%xmm1,%xmm4
-	vpxor	%xmm4,%xmm3,%xmm3
-
-	vpslldq	$8,%xmm3,%xmm4
-	vpsrldq	$8,%xmm3,%xmm3
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpxor	%xmm3,%xmm1,%xmm1
-	vpsllq	$57,%xmm0,%xmm3
-	vpsllq	$62,%xmm0,%xmm4
-	vpxor	%xmm3,%xmm4,%xmm4
-	vpsllq	$63,%xmm0,%xmm3
-	vpxor	%xmm3,%xmm4,%xmm4
-	vpslldq	$8,%xmm4,%xmm3
-	vpsrldq	$8,%xmm4,%xmm4
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpxor	%xmm4,%xmm1,%xmm1
-
-	vpsrlq	$1,%xmm0,%xmm4
-	vpxor	%xmm0,%xmm1,%xmm1
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpsrlq	$5,%xmm4,%xmm4
-	vpxor	%xmm4,%xmm0,%xmm0
-	vpsrlq	$1,%xmm0,%xmm0
-	vpxor	%xmm1,%xmm0,%xmm0
-	vpshufd	$78,%xmm5,%xmm3
-	vpshufd	$78,%xmm0,%xmm4
-	vpxor	%xmm5,%xmm3,%xmm3
-	vmovdqu	%xmm5,0(%rdi)
-	vpxor	%xmm0,%xmm4,%xmm4
-	vmovdqu	%xmm0,16(%rdi)
-	leaq	48(%rdi),%rdi
-	subq	$1,%r10
-	jnz	L$init_loop_avx
-
-	vpalignr	$8,%xmm4,%xmm3,%xmm5
-	vmovdqu	%xmm5,-16(%rdi)
-
-	vzeroupper
-	ret
-
-
-
-.globl	_gcm_gmult_avx
-.private_extern _gcm_gmult_avx
-
-.p2align	5
-_gcm_gmult_avx:
-
-_CET_ENDBR
-	jmp	L$_gmult_clmul
-
-
-.globl	_gcm_ghash_avx
-.private_extern _gcm_ghash_avx
-
-.p2align	5
-_gcm_ghash_avx:
-
-_CET_ENDBR
-	vzeroupper
-
-	vmovdqu	(%rdi),%xmm10
-	leaq	L$0x1c2_polynomial(%rip),%r10
-	leaq	64(%rsi),%rsi
-	vmovdqu	L$bswap_mask(%rip),%xmm13
-	vpshufb	%xmm13,%xmm10,%xmm10
-	cmpq	$0x80,%rcx
-	jb	L$short_avx
-	subq	$0x80,%rcx
-
-	vmovdqu	112(%rdx),%xmm14
-	vmovdqu	0-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vmovdqu	32-64(%rsi),%xmm7
-
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vmovdqu	96(%rdx),%xmm15
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpxor	%xmm14,%xmm9,%xmm9
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	16-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vmovdqu	80(%rdx),%xmm14
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm15,%xmm8,%xmm8
-
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vmovdqu	48-64(%rsi),%xmm6
-	vpxor	%xmm14,%xmm9,%xmm9
-	vmovdqu	64(%rdx),%xmm15
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	80-64(%rsi),%xmm7
-
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	64-64(%rsi),%xmm6
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	48(%rdx),%xmm14
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vmovdqu	96-64(%rsi),%xmm6
-	vpxor	%xmm5,%xmm2,%xmm2
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	128-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-
-	vmovdqu	32(%rdx),%xmm15
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	112-64(%rsi),%xmm6
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	16(%rdx),%xmm14
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vmovdqu	144-64(%rsi),%xmm6
-	vpxor	%xmm5,%xmm2,%xmm2
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	176-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-
-	vmovdqu	(%rdx),%xmm15
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	160-64(%rsi),%xmm6
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
-
-	leaq	128(%rdx),%rdx
-	cmpq	$0x80,%rcx
-	jb	L$tail_avx
-
-	vpxor	%xmm10,%xmm15,%xmm15
-	subq	$0x80,%rcx
-	jmp	L$oop8x_avx
-
-.p2align	5
-L$oop8x_avx:
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vmovdqu	112(%rdx),%xmm14
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpxor	%xmm15,%xmm8,%xmm8
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
-	vmovdqu	0-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
-	vmovdqu	32-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-
-	vmovdqu	96(%rdx),%xmm15
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpxor	%xmm3,%xmm10,%xmm10
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vxorps	%xmm4,%xmm11,%xmm11
-	vmovdqu	16-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm5,%xmm12,%xmm12
-	vxorps	%xmm15,%xmm8,%xmm8
-
-	vmovdqu	80(%rdx),%xmm14
-	vpxor	%xmm10,%xmm12,%xmm12
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpxor	%xmm11,%xmm12,%xmm12
-	vpslldq	$8,%xmm12,%xmm9
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vpsrldq	$8,%xmm12,%xmm12
-	vpxor	%xmm9,%xmm10,%xmm10
-	vmovdqu	48-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vxorps	%xmm12,%xmm11,%xmm11
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	80-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-	vpxor	%xmm2,%xmm5,%xmm5
-
-	vmovdqu	64(%rdx),%xmm15
-	vpalignr	$8,%xmm10,%xmm10,%xmm12
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	64-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vxorps	%xmm15,%xmm8,%xmm8
-	vpxor	%xmm5,%xmm2,%xmm2
-
-	vmovdqu	48(%rdx),%xmm14
-	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vmovdqu	96-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	128-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-	vpxor	%xmm2,%xmm5,%xmm5
-
-	vmovdqu	32(%rdx),%xmm15
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpxor	%xmm3,%xmm0,%xmm0
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	112-64(%rsi),%xmm6
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm4,%xmm1,%xmm1
-	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm15,%xmm8,%xmm8
-	vpxor	%xmm5,%xmm2,%xmm2
-	vxorps	%xmm12,%xmm10,%xmm10
-
-	vmovdqu	16(%rdx),%xmm14
-	vpalignr	$8,%xmm10,%xmm10,%xmm12
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
-	vpshufb	%xmm13,%xmm14,%xmm14
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
-	vmovdqu	144-64(%rsi),%xmm6
-	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
-	vxorps	%xmm11,%xmm12,%xmm12
-	vpunpckhqdq	%xmm14,%xmm14,%xmm9
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
-	vmovdqu	176-64(%rsi),%xmm7
-	vpxor	%xmm14,%xmm9,%xmm9
-	vpxor	%xmm2,%xmm5,%xmm5
-
-	vmovdqu	(%rdx),%xmm15
-	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
-	vpshufb	%xmm13,%xmm15,%xmm15
-	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
-	vmovdqu	160-64(%rsi),%xmm6
-	vpxor	%xmm12,%xmm15,%xmm15
-	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
-	vpxor	%xmm10,%xmm15,%xmm15
-
-	leaq	128(%rdx),%rdx
-	subq	$0x80,%rcx
-	jnc	L$oop8x_avx
-
-	addq	$0x80,%rcx
-	jmp	L$tail_no_xor_avx
-
-.p2align	5
-L$short_avx:
-	vmovdqu	-16(%rdx,%rcx,1),%xmm14
-	leaq	(%rdx,%rcx,1),%rdx
-	vmovdqu	0-64(%rsi),%xmm6
-	vmovdqu	32-64(%rsi),%xmm7
-	vpshufb	%xmm13,%xmm14,%xmm15
-
-	vmovdqa	%xmm0,%xmm3
-	vmovdqa	%xmm1,%xmm4
-	vmovdqa	%xmm2,%xmm5
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-32(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	16-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vpsrldq	$8,%xmm7,%xmm7
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-48(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	48-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vmovdqu	80-64(%rsi),%xmm7
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-64(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	64-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vpsrldq	$8,%xmm7,%xmm7
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-80(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	96-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vmovdqu	128-64(%rsi),%xmm7
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-96(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	112-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vpsrldq	$8,%xmm7,%xmm7
-	subq	$0x10,%rcx
-	jz	L$tail_avx
-
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vmovdqu	-112(%rdx),%xmm14
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vmovdqu	144-64(%rsi),%xmm6
-	vpshufb	%xmm13,%xmm14,%xmm15
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-	vmovq	184-64(%rsi),%xmm7
-	subq	$0x10,%rcx
-	jmp	L$tail_avx
-
-.p2align	5
-L$tail_avx:
-	vpxor	%xmm10,%xmm15,%xmm15
-L$tail_no_xor_avx:
-	vpunpckhqdq	%xmm15,%xmm15,%xmm8
-	vpxor	%xmm0,%xmm3,%xmm3
-	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
-	vpxor	%xmm15,%xmm8,%xmm8
-	vpxor	%xmm1,%xmm4,%xmm4
-	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
-	vpxor	%xmm2,%xmm5,%xmm5
-	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
-
-	vmovdqu	(%r10),%xmm12
-
-	vpxor	%xmm0,%xmm3,%xmm10
-	vpxor	%xmm1,%xmm4,%xmm11
-	vpxor	%xmm2,%xmm5,%xmm5
-
-	vpxor	%xmm10,%xmm5,%xmm5
-	vpxor	%xmm11,%xmm5,%xmm5
-	vpslldq	$8,%xmm5,%xmm9
-	vpsrldq	$8,%xmm5,%xmm5
-	vpxor	%xmm9,%xmm10,%xmm10
-	vpxor	%xmm5,%xmm11,%xmm11
-
-	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
-	vpalignr	$8,%xmm10,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm10,%xmm10
-
-	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
-	vpalignr	$8,%xmm10,%xmm10,%xmm10
-	vpxor	%xmm11,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm10,%xmm10
-
-	cmpq	$0,%rcx
-	jne	L$short_avx
-
-	vpshufb	%xmm13,%xmm10,%xmm10
-	vmovdqu	%xmm10,(%rdi)
-	vzeroupper
-	ret
-
-
-
-.section	__DATA,__const
-.p2align	6
-L$bswap_mask:
-.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-L$0x1c2_polynomial:
-.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-L$7_mask:
-.long	7,0,7,0
-.p2align	6
-
-.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align	6
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S
deleted file mode 100644
index e4c0241..0000000
--- a/apple-x86_64/crypto/fipsmodule/md5-x86_64-apple.S
+++ /dev/null
@@ -1,690 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-.p2align	4
-
-.globl	_md5_block_asm_data_order
-.private_extern _md5_block_asm_data_order
-
-_md5_block_asm_data_order:
-
-_CET_ENDBR
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$prologue:
-
-
-
-
-	movq	%rdi,%rbp
-	shlq	$6,%rdx
-	leaq	(%rsi,%rdx,1),%rdi
-	movl	0(%rbp),%eax
-	movl	4(%rbp),%ebx
-	movl	8(%rbp),%ecx
-	movl	12(%rbp),%edx
-
-
-
-
-
-
-
-	cmpq	%rdi,%rsi
-	je	L$end
-
-
-L$loop:
-	movl	%eax,%r8d
-	movl	%ebx,%r9d
-	movl	%ecx,%r14d
-	movl	%edx,%r15d
-	movl	0(%rsi),%r10d
-	movl	%edx,%r11d
-	xorl	%ecx,%r11d
-	leal	-680876936(%rax,%r10,1),%eax
-	andl	%ebx,%r11d
-	xorl	%edx,%r11d
-	movl	4(%rsi),%r10d
-	addl	%r11d,%eax
-	roll	$7,%eax
-	movl	%ecx,%r11d
-	addl	%ebx,%eax
-	xorl	%ebx,%r11d
-	leal	-389564586(%rdx,%r10,1),%edx
-	andl	%eax,%r11d
-	xorl	%ecx,%r11d
-	movl	8(%rsi),%r10d
-	addl	%r11d,%edx
-	roll	$12,%edx
-	movl	%ebx,%r11d
-	addl	%eax,%edx
-	xorl	%eax,%r11d
-	leal	606105819(%rcx,%r10,1),%ecx
-	andl	%edx,%r11d
-	xorl	%ebx,%r11d
-	movl	12(%rsi),%r10d
-	addl	%r11d,%ecx
-	roll	$17,%ecx
-	movl	%eax,%r11d
-	addl	%edx,%ecx
-	xorl	%edx,%r11d
-	leal	-1044525330(%rbx,%r10,1),%ebx
-	andl	%ecx,%r11d
-	xorl	%eax,%r11d
-	movl	16(%rsi),%r10d
-	addl	%r11d,%ebx
-	roll	$22,%ebx
-	movl	%edx,%r11d
-	addl	%ecx,%ebx
-	xorl	%ecx,%r11d
-	leal	-176418897(%rax,%r10,1),%eax
-	andl	%ebx,%r11d
-	xorl	%edx,%r11d
-	movl	20(%rsi),%r10d
-	addl	%r11d,%eax
-	roll	$7,%eax
-	movl	%ecx,%r11d
-	addl	%ebx,%eax
-	xorl	%ebx,%r11d
-	leal	1200080426(%rdx,%r10,1),%edx
-	andl	%eax,%r11d
-	xorl	%ecx,%r11d
-	movl	24(%rsi),%r10d
-	addl	%r11d,%edx
-	roll	$12,%edx
-	movl	%ebx,%r11d
-	addl	%eax,%edx
-	xorl	%eax,%r11d
-	leal	-1473231341(%rcx,%r10,1),%ecx
-	andl	%edx,%r11d
-	xorl	%ebx,%r11d
-	movl	28(%rsi),%r10d
-	addl	%r11d,%ecx
-	roll	$17,%ecx
-	movl	%eax,%r11d
-	addl	%edx,%ecx
-	xorl	%edx,%r11d
-	leal	-45705983(%rbx,%r10,1),%ebx
-	andl	%ecx,%r11d
-	xorl	%eax,%r11d
-	movl	32(%rsi),%r10d
-	addl	%r11d,%ebx
-	roll	$22,%ebx
-	movl	%edx,%r11d
-	addl	%ecx,%ebx
-	xorl	%ecx,%r11d
-	leal	1770035416(%rax,%r10,1),%eax
-	andl	%ebx,%r11d
-	xorl	%edx,%r11d
-	movl	36(%rsi),%r10d
-	addl	%r11d,%eax
-	roll	$7,%eax
-	movl	%ecx,%r11d
-	addl	%ebx,%eax
-	xorl	%ebx,%r11d
-	leal	-1958414417(%rdx,%r10,1),%edx
-	andl	%eax,%r11d
-	xorl	%ecx,%r11d
-	movl	40(%rsi),%r10d
-	addl	%r11d,%edx
-	roll	$12,%edx
-	movl	%ebx,%r11d
-	addl	%eax,%edx
-	xorl	%eax,%r11d
-	leal	-42063(%rcx,%r10,1),%ecx
-	andl	%edx,%r11d
-	xorl	%ebx,%r11d
-	movl	44(%rsi),%r10d
-	addl	%r11d,%ecx
-	roll	$17,%ecx
-	movl	%eax,%r11d
-	addl	%edx,%ecx
-	xorl	%edx,%r11d
-	leal	-1990404162(%rbx,%r10,1),%ebx
-	andl	%ecx,%r11d
-	xorl	%eax,%r11d
-	movl	48(%rsi),%r10d
-	addl	%r11d,%ebx
-	roll	$22,%ebx
-	movl	%edx,%r11d
-	addl	%ecx,%ebx
-	xorl	%ecx,%r11d
-	leal	1804603682(%rax,%r10,1),%eax
-	andl	%ebx,%r11d
-	xorl	%edx,%r11d
-	movl	52(%rsi),%r10d
-	addl	%r11d,%eax
-	roll	$7,%eax
-	movl	%ecx,%r11d
-	addl	%ebx,%eax
-	xorl	%ebx,%r11d
-	leal	-40341101(%rdx,%r10,1),%edx
-	andl	%eax,%r11d
-	xorl	%ecx,%r11d
-	movl	56(%rsi),%r10d
-	addl	%r11d,%edx
-	roll	$12,%edx
-	movl	%ebx,%r11d
-	addl	%eax,%edx
-	xorl	%eax,%r11d
-	leal	-1502002290(%rcx,%r10,1),%ecx
-	andl	%edx,%r11d
-	xorl	%ebx,%r11d
-	movl	60(%rsi),%r10d
-	addl	%r11d,%ecx
-	roll	$17,%ecx
-	movl	%eax,%r11d
-	addl	%edx,%ecx
-	xorl	%edx,%r11d
-	leal	1236535329(%rbx,%r10,1),%ebx
-	andl	%ecx,%r11d
-	xorl	%eax,%r11d
-	movl	0(%rsi),%r10d
-	addl	%r11d,%ebx
-	roll	$22,%ebx
-	movl	%edx,%r11d
-	addl	%ecx,%ebx
-	movl	4(%rsi),%r10d
-	movl	%edx,%r11d
-	movl	%edx,%r12d
-	notl	%r11d
-	leal	-165796510(%rax,%r10,1),%eax
-	andl	%ebx,%r12d
-	andl	%ecx,%r11d
-	movl	24(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ecx,%r11d
-	addl	%r12d,%eax
-	movl	%ecx,%r12d
-	roll	$5,%eax
-	addl	%ebx,%eax
-	notl	%r11d
-	leal	-1069501632(%rdx,%r10,1),%edx
-	andl	%eax,%r12d
-	andl	%ebx,%r11d
-	movl	44(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ebx,%r11d
-	addl	%r12d,%edx
-	movl	%ebx,%r12d
-	roll	$9,%edx
-	addl	%eax,%edx
-	notl	%r11d
-	leal	643717713(%rcx,%r10,1),%ecx
-	andl	%edx,%r12d
-	andl	%eax,%r11d
-	movl	0(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%eax,%r11d
-	addl	%r12d,%ecx
-	movl	%eax,%r12d
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	notl	%r11d
-	leal	-373897302(%rbx,%r10,1),%ebx
-	andl	%ecx,%r12d
-	andl	%edx,%r11d
-	movl	20(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%edx,%r11d
-	addl	%r12d,%ebx
-	movl	%edx,%r12d
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	notl	%r11d
-	leal	-701558691(%rax,%r10,1),%eax
-	andl	%ebx,%r12d
-	andl	%ecx,%r11d
-	movl	40(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ecx,%r11d
-	addl	%r12d,%eax
-	movl	%ecx,%r12d
-	roll	$5,%eax
-	addl	%ebx,%eax
-	notl	%r11d
-	leal	38016083(%rdx,%r10,1),%edx
-	andl	%eax,%r12d
-	andl	%ebx,%r11d
-	movl	60(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ebx,%r11d
-	addl	%r12d,%edx
-	movl	%ebx,%r12d
-	roll	$9,%edx
-	addl	%eax,%edx
-	notl	%r11d
-	leal	-660478335(%rcx,%r10,1),%ecx
-	andl	%edx,%r12d
-	andl	%eax,%r11d
-	movl	16(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%eax,%r11d
-	addl	%r12d,%ecx
-	movl	%eax,%r12d
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	notl	%r11d
-	leal	-405537848(%rbx,%r10,1),%ebx
-	andl	%ecx,%r12d
-	andl	%edx,%r11d
-	movl	36(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%edx,%r11d
-	addl	%r12d,%ebx
-	movl	%edx,%r12d
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	notl	%r11d
-	leal	568446438(%rax,%r10,1),%eax
-	andl	%ebx,%r12d
-	andl	%ecx,%r11d
-	movl	56(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ecx,%r11d
-	addl	%r12d,%eax
-	movl	%ecx,%r12d
-	roll	$5,%eax
-	addl	%ebx,%eax
-	notl	%r11d
-	leal	-1019803690(%rdx,%r10,1),%edx
-	andl	%eax,%r12d
-	andl	%ebx,%r11d
-	movl	12(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ebx,%r11d
-	addl	%r12d,%edx
-	movl	%ebx,%r12d
-	roll	$9,%edx
-	addl	%eax,%edx
-	notl	%r11d
-	leal	-187363961(%rcx,%r10,1),%ecx
-	andl	%edx,%r12d
-	andl	%eax,%r11d
-	movl	32(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%eax,%r11d
-	addl	%r12d,%ecx
-	movl	%eax,%r12d
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	notl	%r11d
-	leal	1163531501(%rbx,%r10,1),%ebx
-	andl	%ecx,%r12d
-	andl	%edx,%r11d
-	movl	52(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%edx,%r11d
-	addl	%r12d,%ebx
-	movl	%edx,%r12d
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	notl	%r11d
-	leal	-1444681467(%rax,%r10,1),%eax
-	andl	%ebx,%r12d
-	andl	%ecx,%r11d
-	movl	8(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ecx,%r11d
-	addl	%r12d,%eax
-	movl	%ecx,%r12d
-	roll	$5,%eax
-	addl	%ebx,%eax
-	notl	%r11d
-	leal	-51403784(%rdx,%r10,1),%edx
-	andl	%eax,%r12d
-	andl	%ebx,%r11d
-	movl	28(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%ebx,%r11d
-	addl	%r12d,%edx
-	movl	%ebx,%r12d
-	roll	$9,%edx
-	addl	%eax,%edx
-	notl	%r11d
-	leal	1735328473(%rcx,%r10,1),%ecx
-	andl	%edx,%r12d
-	andl	%eax,%r11d
-	movl	48(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%eax,%r11d
-	addl	%r12d,%ecx
-	movl	%eax,%r12d
-	roll	$14,%ecx
-	addl	%edx,%ecx
-	notl	%r11d
-	leal	-1926607734(%rbx,%r10,1),%ebx
-	andl	%ecx,%r12d
-	andl	%edx,%r11d
-	movl	0(%rsi),%r10d
-	orl	%r11d,%r12d
-	movl	%edx,%r11d
-	addl	%r12d,%ebx
-	movl	%edx,%r12d
-	roll	$20,%ebx
-	addl	%ecx,%ebx
-	movl	20(%rsi),%r10d
-	movl	%ecx,%r11d
-	leal	-378558(%rax,%r10,1),%eax
-	movl	32(%rsi),%r10d
-	xorl	%edx,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%eax
-	roll	$4,%eax
-	movl	%ebx,%r11d
-	addl	%ebx,%eax
-	leal	-2022574463(%rdx,%r10,1),%edx
-	movl	44(%rsi),%r10d
-	xorl	%ecx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%edx
-	roll	$11,%edx
-	movl	%eax,%r11d
-	addl	%eax,%edx
-	leal	1839030562(%rcx,%r10,1),%ecx
-	movl	56(%rsi),%r10d
-	xorl	%ebx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ecx
-	roll	$16,%ecx
-	movl	%edx,%r11d
-	addl	%edx,%ecx
-	leal	-35309556(%rbx,%r10,1),%ebx
-	movl	4(%rsi),%r10d
-	xorl	%eax,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%ebx
-	roll	$23,%ebx
-	movl	%ecx,%r11d
-	addl	%ecx,%ebx
-	leal	-1530992060(%rax,%r10,1),%eax
-	movl	16(%rsi),%r10d
-	xorl	%edx,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%eax
-	roll	$4,%eax
-	movl	%ebx,%r11d
-	addl	%ebx,%eax
-	leal	1272893353(%rdx,%r10,1),%edx
-	movl	28(%rsi),%r10d
-	xorl	%ecx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%edx
-	roll	$11,%edx
-	movl	%eax,%r11d
-	addl	%eax,%edx
-	leal	-155497632(%rcx,%r10,1),%ecx
-	movl	40(%rsi),%r10d
-	xorl	%ebx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ecx
-	roll	$16,%ecx
-	movl	%edx,%r11d
-	addl	%edx,%ecx
-	leal	-1094730640(%rbx,%r10,1),%ebx
-	movl	52(%rsi),%r10d
-	xorl	%eax,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%ebx
-	roll	$23,%ebx
-	movl	%ecx,%r11d
-	addl	%ecx,%ebx
-	leal	681279174(%rax,%r10,1),%eax
-	movl	0(%rsi),%r10d
-	xorl	%edx,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%eax
-	roll	$4,%eax
-	movl	%ebx,%r11d
-	addl	%ebx,%eax
-	leal	-358537222(%rdx,%r10,1),%edx
-	movl	12(%rsi),%r10d
-	xorl	%ecx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%edx
-	roll	$11,%edx
-	movl	%eax,%r11d
-	addl	%eax,%edx
-	leal	-722521979(%rcx,%r10,1),%ecx
-	movl	24(%rsi),%r10d
-	xorl	%ebx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ecx
-	roll	$16,%ecx
-	movl	%edx,%r11d
-	addl	%edx,%ecx
-	leal	76029189(%rbx,%r10,1),%ebx
-	movl	36(%rsi),%r10d
-	xorl	%eax,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%ebx
-	roll	$23,%ebx
-	movl	%ecx,%r11d
-	addl	%ecx,%ebx
-	leal	-640364487(%rax,%r10,1),%eax
-	movl	48(%rsi),%r10d
-	xorl	%edx,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%eax
-	roll	$4,%eax
-	movl	%ebx,%r11d
-	addl	%ebx,%eax
-	leal	-421815835(%rdx,%r10,1),%edx
-	movl	60(%rsi),%r10d
-	xorl	%ecx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%edx
-	roll	$11,%edx
-	movl	%eax,%r11d
-	addl	%eax,%edx
-	leal	530742520(%rcx,%r10,1),%ecx
-	movl	8(%rsi),%r10d
-	xorl	%ebx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ecx
-	roll	$16,%ecx
-	movl	%edx,%r11d
-	addl	%edx,%ecx
-	leal	-995338651(%rbx,%r10,1),%ebx
-	movl	0(%rsi),%r10d
-	xorl	%eax,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%ebx
-	roll	$23,%ebx
-	movl	%ecx,%r11d
-	addl	%ecx,%ebx
-	movl	0(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	xorl	%edx,%r11d
-	leal	-198630844(%rax,%r10,1),%eax
-	orl	%ebx,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%eax
-	movl	28(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$6,%eax
-	xorl	%ecx,%r11d
-	addl	%ebx,%eax
-	leal	1126891415(%rdx,%r10,1),%edx
-	orl	%eax,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%edx
-	movl	56(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$10,%edx
-	xorl	%ebx,%r11d
-	addl	%eax,%edx
-	leal	-1416354905(%rcx,%r10,1),%ecx
-	orl	%edx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%ecx
-	movl	20(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$15,%ecx
-	xorl	%eax,%r11d
-	addl	%edx,%ecx
-	leal	-57434055(%rbx,%r10,1),%ebx
-	orl	%ecx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ebx
-	movl	48(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$21,%ebx
-	xorl	%edx,%r11d
-	addl	%ecx,%ebx
-	leal	1700485571(%rax,%r10,1),%eax
-	orl	%ebx,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%eax
-	movl	12(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$6,%eax
-	xorl	%ecx,%r11d
-	addl	%ebx,%eax
-	leal	-1894986606(%rdx,%r10,1),%edx
-	orl	%eax,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%edx
-	movl	40(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$10,%edx
-	xorl	%ebx,%r11d
-	addl	%eax,%edx
-	leal	-1051523(%rcx,%r10,1),%ecx
-	orl	%edx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%ecx
-	movl	4(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$15,%ecx
-	xorl	%eax,%r11d
-	addl	%edx,%ecx
-	leal	-2054922799(%rbx,%r10,1),%ebx
-	orl	%ecx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ebx
-	movl	32(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$21,%ebx
-	xorl	%edx,%r11d
-	addl	%ecx,%ebx
-	leal	1873313359(%rax,%r10,1),%eax
-	orl	%ebx,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%eax
-	movl	60(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$6,%eax
-	xorl	%ecx,%r11d
-	addl	%ebx,%eax
-	leal	-30611744(%rdx,%r10,1),%edx
-	orl	%eax,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%edx
-	movl	24(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$10,%edx
-	xorl	%ebx,%r11d
-	addl	%eax,%edx
-	leal	-1560198380(%rcx,%r10,1),%ecx
-	orl	%edx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%ecx
-	movl	52(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$15,%ecx
-	xorl	%eax,%r11d
-	addl	%edx,%ecx
-	leal	1309151649(%rbx,%r10,1),%ebx
-	orl	%ecx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ebx
-	movl	16(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$21,%ebx
-	xorl	%edx,%r11d
-	addl	%ecx,%ebx
-	leal	-145523070(%rax,%r10,1),%eax
-	orl	%ebx,%r11d
-	xorl	%ecx,%r11d
-	addl	%r11d,%eax
-	movl	44(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$6,%eax
-	xorl	%ecx,%r11d
-	addl	%ebx,%eax
-	leal	-1120210379(%rdx,%r10,1),%edx
-	orl	%eax,%r11d
-	xorl	%ebx,%r11d
-	addl	%r11d,%edx
-	movl	8(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$10,%edx
-	xorl	%ebx,%r11d
-	addl	%eax,%edx
-	leal	718787259(%rcx,%r10,1),%ecx
-	orl	%edx,%r11d
-	xorl	%eax,%r11d
-	addl	%r11d,%ecx
-	movl	36(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$15,%ecx
-	xorl	%eax,%r11d
-	addl	%edx,%ecx
-	leal	-343485551(%rbx,%r10,1),%ebx
-	orl	%ecx,%r11d
-	xorl	%edx,%r11d
-	addl	%r11d,%ebx
-	movl	0(%rsi),%r10d
-	movl	$0xffffffff,%r11d
-	roll	$21,%ebx
-	xorl	%edx,%r11d
-	addl	%ecx,%ebx
-
-	addl	%r8d,%eax
-	addl	%r9d,%ebx
-	addl	%r14d,%ecx
-	addl	%r15d,%edx
-
-
-	addq	$64,%rsi
-	cmpq	%rdi,%rsi
-	jb	L$loop
-
-
-L$end:
-	movl	%eax,0(%rbp)
-	movl	%ebx,4(%rbp)
-	movl	%ecx,8(%rbp)
-	movl	%edx,12(%rbp)
-
-	movq	(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r12
-
-	movq	24(%rsp),%rbx
-
-	movq	32(%rsp),%rbp
-
-	addq	$40,%rsp
-
-L$epilogue:
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S b/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S
deleted file mode 100644
index 81cb582..0000000
--- a/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm-apple.S
+++ /dev/null
@@ -1,4473 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-.section	__DATA,__const
-.p2align	6
-L$poly:
-.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
-
-L$One:
-.long	1,1,1,1,1,1,1,1
-L$Two:
-.long	2,2,2,2,2,2,2,2
-L$Three:
-.long	3,3,3,3,3,3,3,3
-L$ONE_mont:
-.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
-
-
-L$ord:
-.quad	0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
-L$ordK:
-.quad	0xccd1c8aaee00bc4f
-.text	
-
-
-
-.globl	_ecp_nistz256_neg
-.private_extern _ecp_nistz256_neg
-
-.p2align	5
-_ecp_nistz256_neg:
-
-_CET_ENDBR
-	pushq	%r12
-
-	pushq	%r13
-
-L$neg_body:
-
-	xorq	%r8,%r8
-	xorq	%r9,%r9
-	xorq	%r10,%r10
-	xorq	%r11,%r11
-	xorq	%r13,%r13
-
-	subq	0(%rsi),%r8
-	sbbq	8(%rsi),%r9
-	sbbq	16(%rsi),%r10
-	movq	%r8,%rax
-	sbbq	24(%rsi),%r11
-	leaq	L$poly(%rip),%rsi
-	movq	%r9,%rdx
-	sbbq	$0,%r13
-
-	addq	0(%rsi),%r8
-	movq	%r10,%rcx
-	adcq	8(%rsi),%r9
-	adcq	16(%rsi),%r10
-	movq	%r11,%r12
-	adcq	24(%rsi),%r11
-	testq	%r13,%r13
-
-	cmovzq	%rax,%r8
-	cmovzq	%rdx,%r9
-	movq	%r8,0(%rdi)
-	cmovzq	%rcx,%r10
-	movq	%r9,8(%rdi)
-	cmovzq	%r12,%r11
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-	movq	0(%rsp),%r13
-
-	movq	8(%rsp),%r12
-
-	leaq	16(%rsp),%rsp
-
-L$neg_epilogue:
-	ret
-
-
-
-
-
-
-
-
-.globl	_ecp_nistz256_ord_mul_mont
-.private_extern _ecp_nistz256_ord_mul_mont
-
-.p2align	5
-_ecp_nistz256_ord_mul_mont:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	cmpl	$0x80100,%ecx
-	je	L$ecp_nistz256_ord_mul_montx
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$ord_mul_body:
-
-	movq	0(%rdx),%rax
-	movq	%rdx,%rbx
-	leaq	L$ord(%rip),%r14
-	movq	L$ordK(%rip),%r15
-
-
-	movq	%rax,%rcx
-	mulq	0(%rsi)
-	movq	%rax,%r8
-	movq	%rcx,%rax
-	movq	%rdx,%r9
-
-	mulq	8(%rsi)
-	addq	%rax,%r9
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-
-	mulq	16(%rsi)
-	addq	%rax,%r10
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-
-	movq	%r8,%r13
-	imulq	%r15,%r8
-
-	movq	%rdx,%r11
-	mulq	24(%rsi)
-	addq	%rax,%r11
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r12
-
-
-	mulq	0(%r14)
-	movq	%r8,%rbp
-	addq	%rax,%r13
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	subq	%r8,%r10
-	sbbq	$0,%r8
-
-	mulq	8(%r14)
-	addq	%rcx,%r9
-	adcq	$0,%rdx
-	addq	%rax,%r9
-	movq	%rbp,%rax
-	adcq	%rdx,%r10
-	movq	%rbp,%rdx
-	adcq	$0,%r8
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r11
-	movq	8(%rbx),%rax
-	sbbq	%rdx,%rbp
-
-	addq	%r8,%r11
-	adcq	%rbp,%r12
-	adcq	$0,%r13
-
-
-	movq	%rax,%rcx
-	mulq	0(%rsi)
-	addq	%rax,%r9
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	8(%rsi)
-	addq	%rbp,%r10
-	adcq	$0,%rdx
-	addq	%rax,%r10
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	16(%rsi)
-	addq	%rbp,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-
-	movq	%r9,%rcx
-	imulq	%r15,%r9
-
-	movq	%rdx,%rbp
-	mulq	24(%rsi)
-	addq	%rbp,%r12
-	adcq	$0,%rdx
-	xorq	%r8,%r8
-	addq	%rax,%r12
-	movq	%r9,%rax
-	adcq	%rdx,%r13
-	adcq	$0,%r8
-
-
-	mulq	0(%r14)
-	movq	%r9,%rbp
-	addq	%rax,%rcx
-	movq	%r9,%rax
-	adcq	%rdx,%rcx
-
-	subq	%r9,%r11
-	sbbq	$0,%r9
-
-	mulq	8(%r14)
-	addq	%rcx,%r10
-	adcq	$0,%rdx
-	addq	%rax,%r10
-	movq	%rbp,%rax
-	adcq	%rdx,%r11
-	movq	%rbp,%rdx
-	adcq	$0,%r9
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r12
-	movq	16(%rbx),%rax
-	sbbq	%rdx,%rbp
-
-	addq	%r9,%r12
-	adcq	%rbp,%r13
-	adcq	$0,%r8
-
-
-	movq	%rax,%rcx
-	mulq	0(%rsi)
-	addq	%rax,%r10
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	8(%rsi)
-	addq	%rbp,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	16(%rsi)
-	addq	%rbp,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-
-	movq	%r10,%rcx
-	imulq	%r15,%r10
-
-	movq	%rdx,%rbp
-	mulq	24(%rsi)
-	addq	%rbp,%r13
-	adcq	$0,%rdx
-	xorq	%r9,%r9
-	addq	%rax,%r13
-	movq	%r10,%rax
-	adcq	%rdx,%r8
-	adcq	$0,%r9
-
-
-	mulq	0(%r14)
-	movq	%r10,%rbp
-	addq	%rax,%rcx
-	movq	%r10,%rax
-	adcq	%rdx,%rcx
-
-	subq	%r10,%r12
-	sbbq	$0,%r10
-
-	mulq	8(%r14)
-	addq	%rcx,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%rbp,%rax
-	adcq	%rdx,%r12
-	movq	%rbp,%rdx
-	adcq	$0,%r10
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r13
-	movq	24(%rbx),%rax
-	sbbq	%rdx,%rbp
-
-	addq	%r10,%r13
-	adcq	%rbp,%r8
-	adcq	$0,%r9
-
-
-	movq	%rax,%rcx
-	mulq	0(%rsi)
-	addq	%rax,%r11
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	8(%rsi)
-	addq	%rbp,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	16(%rsi)
-	addq	%rbp,%r13
-	adcq	$0,%rdx
-	addq	%rax,%r13
-	movq	%rcx,%rax
-	adcq	$0,%rdx
-
-	movq	%r11,%rcx
-	imulq	%r15,%r11
-
-	movq	%rdx,%rbp
-	mulq	24(%rsi)
-	addq	%rbp,%r8
-	adcq	$0,%rdx
-	xorq	%r10,%r10
-	addq	%rax,%r8
-	movq	%r11,%rax
-	adcq	%rdx,%r9
-	adcq	$0,%r10
-
-
-	mulq	0(%r14)
-	movq	%r11,%rbp
-	addq	%rax,%rcx
-	movq	%r11,%rax
-	adcq	%rdx,%rcx
-
-	subq	%r11,%r13
-	sbbq	$0,%r11
-
-	mulq	8(%r14)
-	addq	%rcx,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%rbp,%rax
-	adcq	%rdx,%r13
-	movq	%rbp,%rdx
-	adcq	$0,%r11
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r8
-	sbbq	%rdx,%rbp
-
-	addq	%r11,%r8
-	adcq	%rbp,%r9
-	adcq	$0,%r10
-
-
-	movq	%r12,%rsi
-	subq	0(%r14),%r12
-	movq	%r13,%r11
-	sbbq	8(%r14),%r13
-	movq	%r8,%rcx
-	sbbq	16(%r14),%r8
-	movq	%r9,%rbp
-	sbbq	24(%r14),%r9
-	sbbq	$0,%r10
-
-	cmovcq	%rsi,%r12
-	cmovcq	%r11,%r13
-	cmovcq	%rcx,%r8
-	cmovcq	%rbp,%r9
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$ord_mul_epilogue:
-	ret
-
-
-
-
-
-
-
-
-
-.globl	_ecp_nistz256_ord_sqr_mont
-.private_extern _ecp_nistz256_ord_sqr_mont
-
-.p2align	5
-_ecp_nistz256_ord_sqr_mont:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	cmpl	$0x80100,%ecx
-	je	L$ecp_nistz256_ord_sqr_montx
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$ord_sqr_body:
-
-	movq	0(%rsi),%r8
-	movq	8(%rsi),%rax
-	movq	16(%rsi),%r14
-	movq	24(%rsi),%r15
-	leaq	L$ord(%rip),%rsi
-	movq	%rdx,%rbx
-	jmp	L$oop_ord_sqr
-
-.p2align	5
-L$oop_ord_sqr:
-
-	movq	%rax,%rbp
-	mulq	%r8
-	movq	%rax,%r9
-.byte	102,72,15,110,205
-	movq	%r14,%rax
-	movq	%rdx,%r10
-
-	mulq	%r8
-	addq	%rax,%r10
-	movq	%r15,%rax
-.byte	102,73,15,110,214
-	adcq	$0,%rdx
-	movq	%rdx,%r11
-
-	mulq	%r8
-	addq	%rax,%r11
-	movq	%r15,%rax
-.byte	102,73,15,110,223
-	adcq	$0,%rdx
-	movq	%rdx,%r12
-
-
-	mulq	%r14
-	movq	%rax,%r13
-	movq	%r14,%rax
-	movq	%rdx,%r14
-
-
-	mulq	%rbp
-	addq	%rax,%r11
-	movq	%r15,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r15
-
-	mulq	%rbp
-	addq	%rax,%r12
-	adcq	$0,%rdx
-
-	addq	%r15,%r12
-	adcq	%rdx,%r13
-	adcq	$0,%r14
-
-
-	xorq	%r15,%r15
-	movq	%r8,%rax
-	addq	%r9,%r9
-	adcq	%r10,%r10
-	adcq	%r11,%r11
-	adcq	%r12,%r12
-	adcq	%r13,%r13
-	adcq	%r14,%r14
-	adcq	$0,%r15
-
-
-	mulq	%rax
-	movq	%rax,%r8
-.byte	102,72,15,126,200
-	movq	%rdx,%rbp
-
-	mulq	%rax
-	addq	%rbp,%r9
-	adcq	%rax,%r10
-.byte	102,72,15,126,208
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	%rax
-	addq	%rbp,%r11
-	adcq	%rax,%r12
-.byte	102,72,15,126,216
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	movq	%r8,%rcx
-	imulq	32(%rsi),%r8
-
-	mulq	%rax
-	addq	%rbp,%r13
-	adcq	%rax,%r14
-	movq	0(%rsi),%rax
-	adcq	%rdx,%r15
-
-
-	mulq	%r8
-	movq	%r8,%rbp
-	addq	%rax,%rcx
-	movq	8(%rsi),%rax
-	adcq	%rdx,%rcx
-
-	subq	%r8,%r10
-	sbbq	$0,%rbp
-
-	mulq	%r8
-	addq	%rcx,%r9
-	adcq	$0,%rdx
-	addq	%rax,%r9
-	movq	%r8,%rax
-	adcq	%rdx,%r10
-	movq	%r8,%rdx
-	adcq	$0,%rbp
-
-	movq	%r9,%rcx
-	imulq	32(%rsi),%r9
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r11
-	movq	0(%rsi),%rax
-	sbbq	%rdx,%r8
-
-	addq	%rbp,%r11
-	adcq	$0,%r8
-
-
-	mulq	%r9
-	movq	%r9,%rbp
-	addq	%rax,%rcx
-	movq	8(%rsi),%rax
-	adcq	%rdx,%rcx
-
-	subq	%r9,%r11
-	sbbq	$0,%rbp
-
-	mulq	%r9
-	addq	%rcx,%r10
-	adcq	$0,%rdx
-	addq	%rax,%r10
-	movq	%r9,%rax
-	adcq	%rdx,%r11
-	movq	%r9,%rdx
-	adcq	$0,%rbp
-
-	movq	%r10,%rcx
-	imulq	32(%rsi),%r10
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r8
-	movq	0(%rsi),%rax
-	sbbq	%rdx,%r9
-
-	addq	%rbp,%r8
-	adcq	$0,%r9
-
-
-	mulq	%r10
-	movq	%r10,%rbp
-	addq	%rax,%rcx
-	movq	8(%rsi),%rax
-	adcq	%rdx,%rcx
-
-	subq	%r10,%r8
-	sbbq	$0,%rbp
-
-	mulq	%r10
-	addq	%rcx,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%r10,%rax
-	adcq	%rdx,%r8
-	movq	%r10,%rdx
-	adcq	$0,%rbp
-
-	movq	%r11,%rcx
-	imulq	32(%rsi),%r11
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r9
-	movq	0(%rsi),%rax
-	sbbq	%rdx,%r10
-
-	addq	%rbp,%r9
-	adcq	$0,%r10
-
-
-	mulq	%r11
-	movq	%r11,%rbp
-	addq	%rax,%rcx
-	movq	8(%rsi),%rax
-	adcq	%rdx,%rcx
-
-	subq	%r11,%r9
-	sbbq	$0,%rbp
-
-	mulq	%r11
-	addq	%rcx,%r8
-	adcq	$0,%rdx
-	addq	%rax,%r8
-	movq	%r11,%rax
-	adcq	%rdx,%r9
-	movq	%r11,%rdx
-	adcq	$0,%rbp
-
-	shlq	$32,%rax
-	shrq	$32,%rdx
-	subq	%rax,%r10
-	sbbq	%rdx,%r11
-
-	addq	%rbp,%r10
-	adcq	$0,%r11
-
-
-	xorq	%rdx,%rdx
-	addq	%r12,%r8
-	adcq	%r13,%r9
-	movq	%r8,%r12
-	adcq	%r14,%r10
-	adcq	%r15,%r11
-	movq	%r9,%rax
-	adcq	$0,%rdx
-
-
-	subq	0(%rsi),%r8
-	movq	%r10,%r14
-	sbbq	8(%rsi),%r9
-	sbbq	16(%rsi),%r10
-	movq	%r11,%r15
-	sbbq	24(%rsi),%r11
-	sbbq	$0,%rdx
-
-	cmovcq	%r12,%r8
-	cmovncq	%r9,%rax
-	cmovncq	%r10,%r14
-	cmovncq	%r11,%r15
-
-	decq	%rbx
-	jnz	L$oop_ord_sqr
-
-	movq	%r8,0(%rdi)
-	movq	%rax,8(%rdi)
-	pxor	%xmm1,%xmm1
-	movq	%r14,16(%rdi)
-	pxor	%xmm2,%xmm2
-	movq	%r15,24(%rdi)
-	pxor	%xmm3,%xmm3
-
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$ord_sqr_epilogue:
-	ret
-
-
-
-
-.p2align	5
-ecp_nistz256_ord_mul_montx:
-
-L$ecp_nistz256_ord_mul_montx:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$ord_mulx_body:
-
-	movq	%rdx,%rbx
-	movq	0(%rdx),%rdx
-	movq	0(%rsi),%r9
-	movq	8(%rsi),%r10
-	movq	16(%rsi),%r11
-	movq	24(%rsi),%r12
-	leaq	-128(%rsi),%rsi
-	leaq	L$ord-128(%rip),%r14
-	movq	L$ordK(%rip),%r15
-
-
-	mulxq	%r9,%r8,%r9
-	mulxq	%r10,%rcx,%r10
-	mulxq	%r11,%rbp,%r11
-	addq	%rcx,%r9
-	mulxq	%r12,%rcx,%r12
-	movq	%r8,%rdx
-	mulxq	%r15,%rdx,%rax
-	adcq	%rbp,%r10
-	adcq	%rcx,%r11
-	adcq	$0,%r12
-
-
-	xorq	%r13,%r13
-	mulxq	0+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r8
-	adoxq	%rbp,%r9
-
-	mulxq	8+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-
-	mulxq	16+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	24+128(%r14),%rcx,%rbp
-	movq	8(%rbx),%rdx
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-	adcxq	%r8,%r12
-	adoxq	%r8,%r13
-	adcq	$0,%r13
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r9,%rdx
-	mulxq	%r15,%rdx,%rax
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	adcxq	%r8,%r13
-	adoxq	%r8,%r8
-	adcq	$0,%r8
-
-
-	mulxq	0+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-
-	mulxq	8+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	16+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	24+128(%r14),%rcx,%rbp
-	movq	16(%rbx),%rdx
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-	adcxq	%r9,%r13
-	adoxq	%r9,%r8
-	adcq	$0,%r8
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r10,%rdx
-	mulxq	%r15,%rdx,%rax
-	adcxq	%rcx,%r13
-	adoxq	%rbp,%r8
-
-	adcxq	%r9,%r8
-	adoxq	%r9,%r9
-	adcq	$0,%r9
-
-
-	mulxq	0+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	8+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	16+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	24+128(%r14),%rcx,%rbp
-	movq	24(%rbx),%rdx
-	adcxq	%rcx,%r13
-	adoxq	%rbp,%r8
-	adcxq	%r10,%r8
-	adoxq	%r10,%r9
-	adcq	$0,%r9
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r13
-	adoxq	%rbp,%r8
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r11,%rdx
-	mulxq	%r15,%rdx,%rax
-	adcxq	%rcx,%r8
-	adoxq	%rbp,%r9
-
-	adcxq	%r10,%r9
-	adoxq	%r10,%r10
-	adcq	$0,%r10
-
-
-	mulxq	0+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	8+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	16+128(%r14),%rcx,%rbp
-	adcxq	%rcx,%r13
-	adoxq	%rbp,%r8
-
-	mulxq	24+128(%r14),%rcx,%rbp
-	leaq	128(%r14),%r14
-	movq	%r12,%rbx
-	adcxq	%rcx,%r8
-	adoxq	%rbp,%r9
-	movq	%r13,%rdx
-	adcxq	%r11,%r9
-	adoxq	%r11,%r10
-	adcq	$0,%r10
-
-
-
-	movq	%r8,%rcx
-	subq	0(%r14),%r12
-	sbbq	8(%r14),%r13
-	sbbq	16(%r14),%r8
-	movq	%r9,%rbp
-	sbbq	24(%r14),%r9
-	sbbq	$0,%r10
-
-	cmovcq	%rbx,%r12
-	cmovcq	%rdx,%r13
-	cmovcq	%rcx,%r8
-	cmovcq	%rbp,%r9
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$ord_mulx_epilogue:
-	ret
-
-
-
-
-.p2align	5
-ecp_nistz256_ord_sqr_montx:
-
-L$ecp_nistz256_ord_sqr_montx:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$ord_sqrx_body:
-
-	movq	%rdx,%rbx
-	movq	0(%rsi),%rdx
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r15
-	movq	24(%rsi),%r8
-	leaq	L$ord(%rip),%rsi
-	jmp	L$oop_ord_sqrx
-
-.p2align	5
-L$oop_ord_sqrx:
-	mulxq	%r14,%r9,%r10
-	mulxq	%r15,%rcx,%r11
-	movq	%rdx,%rax
-.byte	102,73,15,110,206
-	mulxq	%r8,%rbp,%r12
-	movq	%r14,%rdx
-	addq	%rcx,%r10
-.byte	102,73,15,110,215
-	adcq	%rbp,%r11
-	adcq	$0,%r12
-	xorq	%r13,%r13
-
-	mulxq	%r15,%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	%r8,%rcx,%rbp
-	movq	%r15,%rdx
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-	adcq	$0,%r13
-
-	mulxq	%r8,%rcx,%r14
-	movq	%rax,%rdx
-.byte	102,73,15,110,216
-	xorq	%r15,%r15
-	adcxq	%r9,%r9
-	adoxq	%rcx,%r13
-	adcxq	%r10,%r10
-	adoxq	%r15,%r14
-
-
-	mulxq	%rdx,%r8,%rbp
-.byte	102,72,15,126,202
-	adcxq	%r11,%r11
-	adoxq	%rbp,%r9
-	adcxq	%r12,%r12
-	mulxq	%rdx,%rcx,%rax
-.byte	102,72,15,126,210
-	adcxq	%r13,%r13
-	adoxq	%rcx,%r10
-	adcxq	%r14,%r14
-	mulxq	%rdx,%rcx,%rbp
-.byte	0x67
-.byte	102,72,15,126,218
-	adoxq	%rax,%r11
-	adcxq	%r15,%r15
-	adoxq	%rcx,%r12
-	adoxq	%rbp,%r13
-	mulxq	%rdx,%rcx,%rax
-	adoxq	%rcx,%r14
-	adoxq	%rax,%r15
-
-
-	movq	%r8,%rdx
-	mulxq	32(%rsi),%rdx,%rcx
-
-	xorq	%rax,%rax
-	mulxq	0(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r8
-	adoxq	%rbp,%r9
-	mulxq	8(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-	mulxq	16(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-	mulxq	24(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r8
-	adcxq	%rax,%r8
-
-
-	movq	%r9,%rdx
-	mulxq	32(%rsi),%rdx,%rcx
-
-	mulxq	0(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r9
-	adcxq	%rbp,%r10
-	mulxq	8(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r10
-	adcxq	%rbp,%r11
-	mulxq	16(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r11
-	adcxq	%rbp,%r8
-	mulxq	24(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r8
-	adcxq	%rbp,%r9
-	adoxq	%rax,%r9
-
-
-	movq	%r10,%rdx
-	mulxq	32(%rsi),%rdx,%rcx
-
-	mulxq	0(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-	mulxq	8(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r8
-	mulxq	16(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r8
-	adoxq	%rbp,%r9
-	mulxq	24(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-	adcxq	%rax,%r10
-
-
-	movq	%r11,%rdx
-	mulxq	32(%rsi),%rdx,%rcx
-
-	mulxq	0(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r11
-	adcxq	%rbp,%r8
-	mulxq	8(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r8
-	adcxq	%rbp,%r9
-	mulxq	16(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r9
-	adcxq	%rbp,%r10
-	mulxq	24(%rsi),%rcx,%rbp
-	adoxq	%rcx,%r10
-	adcxq	%rbp,%r11
-	adoxq	%rax,%r11
-
-
-	addq	%r8,%r12
-	adcq	%r13,%r9
-	movq	%r12,%rdx
-	adcq	%r14,%r10
-	adcq	%r15,%r11
-	movq	%r9,%r14
-	adcq	$0,%rax
-
-
-	subq	0(%rsi),%r12
-	movq	%r10,%r15
-	sbbq	8(%rsi),%r9
-	sbbq	16(%rsi),%r10
-	movq	%r11,%r8
-	sbbq	24(%rsi),%r11
-	sbbq	$0,%rax
-
-	cmovncq	%r12,%rdx
-	cmovncq	%r9,%r14
-	cmovncq	%r10,%r15
-	cmovncq	%r11,%r8
-
-	decq	%rbx
-	jnz	L$oop_ord_sqrx
-
-	movq	%rdx,0(%rdi)
-	movq	%r14,8(%rdi)
-	pxor	%xmm1,%xmm1
-	movq	%r15,16(%rdi)
-	pxor	%xmm2,%xmm2
-	movq	%r8,24(%rdi)
-	pxor	%xmm3,%xmm3
-
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$ord_sqrx_epilogue:
-	ret
-
-
-
-
-
-
-
-
-.globl	_ecp_nistz256_mul_mont
-.private_extern _ecp_nistz256_mul_mont
-
-.p2align	5
-_ecp_nistz256_mul_mont:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-L$mul_mont:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$mul_body:
-	cmpl	$0x80100,%ecx
-	je	L$mul_montx
-	movq	%rdx,%rbx
-	movq	0(%rdx),%rax
-	movq	0(%rsi),%r9
-	movq	8(%rsi),%r10
-	movq	16(%rsi),%r11
-	movq	24(%rsi),%r12
-
-	call	__ecp_nistz256_mul_montq
-	jmp	L$mul_mont_done
-
-.p2align	5
-L$mul_montx:
-	movq	%rdx,%rbx
-	movq	0(%rdx),%rdx
-	movq	0(%rsi),%r9
-	movq	8(%rsi),%r10
-	movq	16(%rsi),%r11
-	movq	24(%rsi),%r12
-	leaq	-128(%rsi),%rsi
-
-	call	__ecp_nistz256_mul_montx
-L$mul_mont_done:
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$mul_epilogue:
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_mul_montq:
-
-
-
-	movq	%rax,%rbp
-	mulq	%r9
-	movq	L$poly+8(%rip),%r14
-	movq	%rax,%r8
-	movq	%rbp,%rax
-	movq	%rdx,%r9
-
-	mulq	%r10
-	movq	L$poly+24(%rip),%r15
-	addq	%rax,%r9
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r10
-
-	mulq	%r11
-	addq	%rax,%r10
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r11
-
-	mulq	%r12
-	addq	%rax,%r11
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	xorq	%r13,%r13
-	movq	%rdx,%r12
-
-
-
-
-
-
-
-
-
-
-	movq	%r8,%rbp
-	shlq	$32,%r8
-	mulq	%r15
-	shrq	$32,%rbp
-	addq	%r8,%r9
-	adcq	%rbp,%r10
-	adcq	%rax,%r11
-	movq	8(%rbx),%rax
-	adcq	%rdx,%r12
-	adcq	$0,%r13
-	xorq	%r8,%r8
-
-
-
-	movq	%rax,%rbp
-	mulq	0(%rsi)
-	addq	%rax,%r9
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	8(%rsi)
-	addq	%rcx,%r10
-	adcq	$0,%rdx
-	addq	%rax,%r10
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	16(%rsi)
-	addq	%rcx,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	24(%rsi)
-	addq	%rcx,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%r9,%rax
-	adcq	%rdx,%r13
-	adcq	$0,%r8
-
-
-
-	movq	%r9,%rbp
-	shlq	$32,%r9
-	mulq	%r15
-	shrq	$32,%rbp
-	addq	%r9,%r10
-	adcq	%rbp,%r11
-	adcq	%rax,%r12
-	movq	16(%rbx),%rax
-	adcq	%rdx,%r13
-	adcq	$0,%r8
-	xorq	%r9,%r9
-
-
-
-	movq	%rax,%rbp
-	mulq	0(%rsi)
-	addq	%rax,%r10
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	8(%rsi)
-	addq	%rcx,%r11
-	adcq	$0,%rdx
-	addq	%rax,%r11
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	16(%rsi)
-	addq	%rcx,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	24(%rsi)
-	addq	%rcx,%r13
-	adcq	$0,%rdx
-	addq	%rax,%r13
-	movq	%r10,%rax
-	adcq	%rdx,%r8
-	adcq	$0,%r9
-
-
-
-	movq	%r10,%rbp
-	shlq	$32,%r10
-	mulq	%r15
-	shrq	$32,%rbp
-	addq	%r10,%r11
-	adcq	%rbp,%r12
-	adcq	%rax,%r13
-	movq	24(%rbx),%rax
-	adcq	%rdx,%r8
-	adcq	$0,%r9
-	xorq	%r10,%r10
-
-
-
-	movq	%rax,%rbp
-	mulq	0(%rsi)
-	addq	%rax,%r11
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	8(%rsi)
-	addq	%rcx,%r12
-	adcq	$0,%rdx
-	addq	%rax,%r12
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	16(%rsi)
-	addq	%rcx,%r13
-	adcq	$0,%rdx
-	addq	%rax,%r13
-	movq	%rbp,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	24(%rsi)
-	addq	%rcx,%r8
-	adcq	$0,%rdx
-	addq	%rax,%r8
-	movq	%r11,%rax
-	adcq	%rdx,%r9
-	adcq	$0,%r10
-
-
-
-	movq	%r11,%rbp
-	shlq	$32,%r11
-	mulq	%r15
-	shrq	$32,%rbp
-	addq	%r11,%r12
-	adcq	%rbp,%r13
-	movq	%r12,%rcx
-	adcq	%rax,%r8
-	adcq	%rdx,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r10
-
-
-
-	subq	$-1,%r12
-	movq	%r8,%rbx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%rdx
-	sbbq	%r15,%r9
-	sbbq	$0,%r10
-
-	cmovcq	%rcx,%r12
-	cmovcq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rbx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%rdx,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-
-
-
-
-
-
-.globl	_ecp_nistz256_sqr_mont
-.private_extern _ecp_nistz256_sqr_mont
-
-.p2align	5
-_ecp_nistz256_sqr_mont:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-L$sqr_body:
-	cmpl	$0x80100,%ecx
-	je	L$sqr_montx
-	movq	0(%rsi),%rax
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r15
-	movq	24(%rsi),%r8
-
-	call	__ecp_nistz256_sqr_montq
-	jmp	L$sqr_mont_done
-
-.p2align	5
-L$sqr_montx:
-	movq	0(%rsi),%rdx
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r15
-	movq	24(%rsi),%r8
-	leaq	-128(%rsi),%rsi
-
-	call	__ecp_nistz256_sqr_montx
-L$sqr_mont_done:
-	movq	0(%rsp),%r15
-
-	movq	8(%rsp),%r14
-
-	movq	16(%rsp),%r13
-
-	movq	24(%rsp),%r12
-
-	movq	32(%rsp),%rbx
-
-	movq	40(%rsp),%rbp
-
-	leaq	48(%rsp),%rsp
-
-L$sqr_epilogue:
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_sqr_montq:
-
-	movq	%rax,%r13
-	mulq	%r14
-	movq	%rax,%r9
-	movq	%r15,%rax
-	movq	%rdx,%r10
-
-	mulq	%r13
-	addq	%rax,%r10
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r11
-
-	mulq	%r13
-	addq	%rax,%r11
-	movq	%r15,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r12
-
-
-	mulq	%r14
-	addq	%rax,%r11
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rbp
-
-	mulq	%r14
-	addq	%rax,%r12
-	movq	%r8,%rax
-	adcq	$0,%rdx
-	addq	%rbp,%r12
-	movq	%rdx,%r13
-	adcq	$0,%r13
-
-
-	mulq	%r15
-	xorq	%r15,%r15
-	addq	%rax,%r13
-	movq	0(%rsi),%rax
-	movq	%rdx,%r14
-	adcq	$0,%r14
-
-	addq	%r9,%r9
-	adcq	%r10,%r10
-	adcq	%r11,%r11
-	adcq	%r12,%r12
-	adcq	%r13,%r13
-	adcq	%r14,%r14
-	adcq	$0,%r15
-
-	mulq	%rax
-	movq	%rax,%r8
-	movq	8(%rsi),%rax
-	movq	%rdx,%rcx
-
-	mulq	%rax
-	addq	%rcx,%r9
-	adcq	%rax,%r10
-	movq	16(%rsi),%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	%rax
-	addq	%rcx,%r11
-	adcq	%rax,%r12
-	movq	24(%rsi),%rax
-	adcq	$0,%rdx
-	movq	%rdx,%rcx
-
-	mulq	%rax
-	addq	%rcx,%r13
-	adcq	%rax,%r14
-	movq	%r8,%rax
-	adcq	%rdx,%r15
-
-	movq	L$poly+8(%rip),%rsi
-	movq	L$poly+24(%rip),%rbp
-
-
-
-
-	movq	%r8,%rcx
-	shlq	$32,%r8
-	mulq	%rbp
-	shrq	$32,%rcx
-	addq	%r8,%r9
-	adcq	%rcx,%r10
-	adcq	%rax,%r11
-	movq	%r9,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r9,%rcx
-	shlq	$32,%r9
-	movq	%rdx,%r8
-	mulq	%rbp
-	shrq	$32,%rcx
-	addq	%r9,%r10
-	adcq	%rcx,%r11
-	adcq	%rax,%r8
-	movq	%r10,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r10,%rcx
-	shlq	$32,%r10
-	movq	%rdx,%r9
-	mulq	%rbp
-	shrq	$32,%rcx
-	addq	%r10,%r11
-	adcq	%rcx,%r8
-	adcq	%rax,%r9
-	movq	%r11,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r11,%rcx
-	shlq	$32,%r11
-	movq	%rdx,%r10
-	mulq	%rbp
-	shrq	$32,%rcx
-	addq	%r11,%r8
-	adcq	%rcx,%r9
-	adcq	%rax,%r10
-	adcq	$0,%rdx
-	xorq	%r11,%r11
-
-
-
-	addq	%r8,%r12
-	adcq	%r9,%r13
-	movq	%r12,%r8
-	adcq	%r10,%r14
-	adcq	%rdx,%r15
-	movq	%r13,%r9
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r14,%r10
-	sbbq	%rsi,%r13
-	sbbq	$0,%r14
-	movq	%r15,%rcx
-	sbbq	%rbp,%r15
-	sbbq	$0,%r11
-
-	cmovcq	%r8,%r12
-	cmovcq	%r9,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%r10,%r14
-	movq	%r13,8(%rdi)
-	cmovcq	%rcx,%r15
-	movq	%r14,16(%rdi)
-	movq	%r15,24(%rdi)
-
-	ret
-
-
-
-.p2align	5
-__ecp_nistz256_mul_montx:
-
-
-
-	mulxq	%r9,%r8,%r9
-	mulxq	%r10,%rcx,%r10
-	movq	$32,%r14
-	xorq	%r13,%r13
-	mulxq	%r11,%rbp,%r11
-	movq	L$poly+24(%rip),%r15
-	adcq	%rcx,%r9
-	mulxq	%r12,%rcx,%r12
-	movq	%r8,%rdx
-	adcq	%rbp,%r10
-	shlxq	%r14,%r8,%rbp
-	adcq	%rcx,%r11
-	shrxq	%r14,%r8,%rcx
-	adcq	$0,%r12
-
-
-
-	addq	%rbp,%r9
-	adcq	%rcx,%r10
-
-	mulxq	%r15,%rcx,%rbp
-	movq	8(%rbx),%rdx
-	adcq	%rcx,%r11
-	adcq	%rbp,%r12
-	adcq	$0,%r13
-	xorq	%r8,%r8
-
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r9
-	adoxq	%rbp,%r10
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r9,%rdx
-	adcxq	%rcx,%r12
-	shlxq	%r14,%r9,%rcx
-	adoxq	%rbp,%r13
-	shrxq	%r14,%r9,%rbp
-
-	adcxq	%r8,%r13
-	adoxq	%r8,%r8
-	adcq	$0,%r8
-
-
-
-	addq	%rcx,%r10
-	adcq	%rbp,%r11
-
-	mulxq	%r15,%rcx,%rbp
-	movq	16(%rbx),%rdx
-	adcq	%rcx,%r12
-	adcq	%rbp,%r13
-	adcq	$0,%r8
-	xorq	%r9,%r9
-
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r10
-	adoxq	%rbp,%r11
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r10,%rdx
-	adcxq	%rcx,%r13
-	shlxq	%r14,%r10,%rcx
-	adoxq	%rbp,%r8
-	shrxq	%r14,%r10,%rbp
-
-	adcxq	%r9,%r8
-	adoxq	%r9,%r9
-	adcq	$0,%r9
-
-
-
-	addq	%rcx,%r11
-	adcq	%rbp,%r12
-
-	mulxq	%r15,%rcx,%rbp
-	movq	24(%rbx),%rdx
-	adcq	%rcx,%r13
-	adcq	%rbp,%r8
-	adcq	$0,%r9
-	xorq	%r10,%r10
-
-
-
-	mulxq	0+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	8+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-
-	mulxq	16+128(%rsi),%rcx,%rbp
-	adcxq	%rcx,%r13
-	adoxq	%rbp,%r8
-
-	mulxq	24+128(%rsi),%rcx,%rbp
-	movq	%r11,%rdx
-	adcxq	%rcx,%r8
-	shlxq	%r14,%r11,%rcx
-	adoxq	%rbp,%r9
-	shrxq	%r14,%r11,%rbp
-
-	adcxq	%r10,%r9
-	adoxq	%r10,%r10
-	adcq	$0,%r10
-
-
-
-	addq	%rcx,%r12
-	adcq	%rbp,%r13
-
-	mulxq	%r15,%rcx,%rbp
-	movq	%r12,%rbx
-	movq	L$poly+8(%rip),%r14
-	adcq	%rcx,%r8
-	movq	%r13,%rdx
-	adcq	%rbp,%r9
-	adcq	$0,%r10
-
-
-
-	xorl	%eax,%eax
-	movq	%r8,%rcx
-	sbbq	$-1,%r12
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%rbp
-	sbbq	%r15,%r9
-	sbbq	$0,%r10
-
-	cmovcq	%rbx,%r12
-	cmovcq	%rdx,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%rbp,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_sqr_montx:
-
-	mulxq	%r14,%r9,%r10
-	mulxq	%r15,%rcx,%r11
-	xorl	%eax,%eax
-	adcq	%rcx,%r10
-	mulxq	%r8,%rbp,%r12
-	movq	%r14,%rdx
-	adcq	%rbp,%r11
-	adcq	$0,%r12
-	xorq	%r13,%r13
-
-
-	mulxq	%r15,%rcx,%rbp
-	adcxq	%rcx,%r11
-	adoxq	%rbp,%r12
-
-	mulxq	%r8,%rcx,%rbp
-	movq	%r15,%rdx
-	adcxq	%rcx,%r12
-	adoxq	%rbp,%r13
-	adcq	$0,%r13
-
-
-	mulxq	%r8,%rcx,%r14
-	movq	0+128(%rsi),%rdx
-	xorq	%r15,%r15
-	adcxq	%r9,%r9
-	adoxq	%rcx,%r13
-	adcxq	%r10,%r10
-	adoxq	%r15,%r14
-
-	mulxq	%rdx,%r8,%rbp
-	movq	8+128(%rsi),%rdx
-	adcxq	%r11,%r11
-	adoxq	%rbp,%r9
-	adcxq	%r12,%r12
-	mulxq	%rdx,%rcx,%rax
-	movq	16+128(%rsi),%rdx
-	adcxq	%r13,%r13
-	adoxq	%rcx,%r10
-	adcxq	%r14,%r14
-.byte	0x67
-	mulxq	%rdx,%rcx,%rbp
-	movq	24+128(%rsi),%rdx
-	adoxq	%rax,%r11
-	adcxq	%r15,%r15
-	adoxq	%rcx,%r12
-	movq	$32,%rsi
-	adoxq	%rbp,%r13
-.byte	0x67,0x67
-	mulxq	%rdx,%rcx,%rax
-	movq	L$poly+24(%rip),%rdx
-	adoxq	%rcx,%r14
-	shlxq	%rsi,%r8,%rcx
-	adoxq	%rax,%r15
-	shrxq	%rsi,%r8,%rax
-	movq	%rdx,%rbp
-
-
-	addq	%rcx,%r9
-	adcq	%rax,%r10
-
-	mulxq	%r8,%rcx,%r8
-	adcq	%rcx,%r11
-	shlxq	%rsi,%r9,%rcx
-	adcq	$0,%r8
-	shrxq	%rsi,%r9,%rax
-
-
-	addq	%rcx,%r10
-	adcq	%rax,%r11
-
-	mulxq	%r9,%rcx,%r9
-	adcq	%rcx,%r8
-	shlxq	%rsi,%r10,%rcx
-	adcq	$0,%r9
-	shrxq	%rsi,%r10,%rax
-
-
-	addq	%rcx,%r11
-	adcq	%rax,%r8
-
-	mulxq	%r10,%rcx,%r10
-	adcq	%rcx,%r9
-	shlxq	%rsi,%r11,%rcx
-	adcq	$0,%r10
-	shrxq	%rsi,%r11,%rax
-
-
-	addq	%rcx,%r8
-	adcq	%rax,%r9
-
-	mulxq	%r11,%rcx,%r11
-	adcq	%rcx,%r10
-	adcq	$0,%r11
-
-	xorq	%rdx,%rdx
-	addq	%r8,%r12
-	movq	L$poly+8(%rip),%rsi
-	adcq	%r9,%r13
-	movq	%r12,%r8
-	adcq	%r10,%r14
-	adcq	%r11,%r15
-	movq	%r13,%r9
-	adcq	$0,%rdx
-
-	subq	$-1,%r12
-	movq	%r14,%r10
-	sbbq	%rsi,%r13
-	sbbq	$0,%r14
-	movq	%r15,%r11
-	sbbq	%rbp,%r15
-	sbbq	$0,%rdx
-
-	cmovcq	%r8,%r12
-	cmovcq	%r9,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%r10,%r14
-	movq	%r13,8(%rdi)
-	cmovcq	%r11,%r15
-	movq	%r14,16(%rdi)
-	movq	%r15,24(%rdi)
-
-	ret
-
-
-
-
-.globl	_ecp_nistz256_select_w5
-.private_extern _ecp_nistz256_select_w5
-
-.p2align	5
-_ecp_nistz256_select_w5:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rax
-	movq	8(%rax),%rax
-	testl	$32,%eax
-	jnz	L$avx2_select_w5
-	movdqa	L$One(%rip),%xmm0
-	movd	%edx,%xmm1
-
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-
-	movdqa	%xmm0,%xmm8
-	pshufd	$0,%xmm1,%xmm1
-
-	movq	$16,%rax
-L$select_loop_sse_w5:
-
-	movdqa	%xmm8,%xmm15
-	paddd	%xmm0,%xmm8
-	pcmpeqd	%xmm1,%xmm15
-
-	movdqa	0(%rsi),%xmm9
-	movdqa	16(%rsi),%xmm10
-	movdqa	32(%rsi),%xmm11
-	movdqa	48(%rsi),%xmm12
-	movdqa	64(%rsi),%xmm13
-	movdqa	80(%rsi),%xmm14
-	leaq	96(%rsi),%rsi
-
-	pand	%xmm15,%xmm9
-	pand	%xmm15,%xmm10
-	por	%xmm9,%xmm2
-	pand	%xmm15,%xmm11
-	por	%xmm10,%xmm3
-	pand	%xmm15,%xmm12
-	por	%xmm11,%xmm4
-	pand	%xmm15,%xmm13
-	por	%xmm12,%xmm5
-	pand	%xmm15,%xmm14
-	por	%xmm13,%xmm6
-	por	%xmm14,%xmm7
-
-	decq	%rax
-	jnz	L$select_loop_sse_w5
-
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-	movdqu	%xmm4,32(%rdi)
-	movdqu	%xmm5,48(%rdi)
-	movdqu	%xmm6,64(%rdi)
-	movdqu	%xmm7,80(%rdi)
-	ret
-
-L$SEH_end_ecp_nistz256_select_w5:
-
-
-
-
-.globl	_ecp_nistz256_select_w7
-.private_extern _ecp_nistz256_select_w7
-
-.p2align	5
-_ecp_nistz256_select_w7:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rax
-	movq	8(%rax),%rax
-	testl	$32,%eax
-	jnz	L$avx2_select_w7
-	movdqa	L$One(%rip),%xmm8
-	movd	%edx,%xmm1
-
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-
-	movdqa	%xmm8,%xmm0
-	pshufd	$0,%xmm1,%xmm1
-	movq	$64,%rax
-
-L$select_loop_sse_w7:
-	movdqa	%xmm8,%xmm15
-	paddd	%xmm0,%xmm8
-	movdqa	0(%rsi),%xmm9
-	movdqa	16(%rsi),%xmm10
-	pcmpeqd	%xmm1,%xmm15
-	movdqa	32(%rsi),%xmm11
-	movdqa	48(%rsi),%xmm12
-	leaq	64(%rsi),%rsi
-
-	pand	%xmm15,%xmm9
-	pand	%xmm15,%xmm10
-	por	%xmm9,%xmm2
-	pand	%xmm15,%xmm11
-	por	%xmm10,%xmm3
-	pand	%xmm15,%xmm12
-	por	%xmm11,%xmm4
-	prefetcht0	255(%rsi)
-	por	%xmm12,%xmm5
-
-	decq	%rax
-	jnz	L$select_loop_sse_w7
-
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-	movdqu	%xmm4,32(%rdi)
-	movdqu	%xmm5,48(%rdi)
-	ret
-
-L$SEH_end_ecp_nistz256_select_w7:
-
-
-
-
-.p2align	5
-ecp_nistz256_avx2_select_w5:
-
-L$avx2_select_w5:
-	vzeroupper
-	vmovdqa	L$Two(%rip),%ymm0
-
-	vpxor	%ymm2,%ymm2,%ymm2
-	vpxor	%ymm3,%ymm3,%ymm3
-	vpxor	%ymm4,%ymm4,%ymm4
-
-	vmovdqa	L$One(%rip),%ymm5
-	vmovdqa	L$Two(%rip),%ymm10
-
-	vmovd	%edx,%xmm1
-	vpermd	%ymm1,%ymm2,%ymm1
-
-	movq	$8,%rax
-L$select_loop_avx2_w5:
-
-	vmovdqa	0(%rsi),%ymm6
-	vmovdqa	32(%rsi),%ymm7
-	vmovdqa	64(%rsi),%ymm8
-
-	vmovdqa	96(%rsi),%ymm11
-	vmovdqa	128(%rsi),%ymm12
-	vmovdqa	160(%rsi),%ymm13
-
-	vpcmpeqd	%ymm1,%ymm5,%ymm9
-	vpcmpeqd	%ymm1,%ymm10,%ymm14
-
-	vpaddd	%ymm0,%ymm5,%ymm5
-	vpaddd	%ymm0,%ymm10,%ymm10
-	leaq	192(%rsi),%rsi
-
-	vpand	%ymm9,%ymm6,%ymm6
-	vpand	%ymm9,%ymm7,%ymm7
-	vpand	%ymm9,%ymm8,%ymm8
-	vpand	%ymm14,%ymm11,%ymm11
-	vpand	%ymm14,%ymm12,%ymm12
-	vpand	%ymm14,%ymm13,%ymm13
-
-	vpxor	%ymm6,%ymm2,%ymm2
-	vpxor	%ymm7,%ymm3,%ymm3
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpxor	%ymm11,%ymm2,%ymm2
-	vpxor	%ymm12,%ymm3,%ymm3
-	vpxor	%ymm13,%ymm4,%ymm4
-
-	decq	%rax
-	jnz	L$select_loop_avx2_w5
-
-	vmovdqu	%ymm2,0(%rdi)
-	vmovdqu	%ymm3,32(%rdi)
-	vmovdqu	%ymm4,64(%rdi)
-	vzeroupper
-	ret
-
-L$SEH_end_ecp_nistz256_avx2_select_w5:
-
-
-
-
-.globl	_ecp_nistz256_avx2_select_w7
-.private_extern _ecp_nistz256_avx2_select_w7
-
-.p2align	5
-_ecp_nistz256_avx2_select_w7:
-
-L$avx2_select_w7:
-_CET_ENDBR
-	vzeroupper
-	vmovdqa	L$Three(%rip),%ymm0
-
-	vpxor	%ymm2,%ymm2,%ymm2
-	vpxor	%ymm3,%ymm3,%ymm3
-
-	vmovdqa	L$One(%rip),%ymm4
-	vmovdqa	L$Two(%rip),%ymm8
-	vmovdqa	L$Three(%rip),%ymm12
-
-	vmovd	%edx,%xmm1
-	vpermd	%ymm1,%ymm2,%ymm1
-
-
-	movq	$21,%rax
-L$select_loop_avx2_w7:
-
-	vmovdqa	0(%rsi),%ymm5
-	vmovdqa	32(%rsi),%ymm6
-
-	vmovdqa	64(%rsi),%ymm9
-	vmovdqa	96(%rsi),%ymm10
-
-	vmovdqa	128(%rsi),%ymm13
-	vmovdqa	160(%rsi),%ymm14
-
-	vpcmpeqd	%ymm1,%ymm4,%ymm7
-	vpcmpeqd	%ymm1,%ymm8,%ymm11
-	vpcmpeqd	%ymm1,%ymm12,%ymm15
-
-	vpaddd	%ymm0,%ymm4,%ymm4
-	vpaddd	%ymm0,%ymm8,%ymm8
-	vpaddd	%ymm0,%ymm12,%ymm12
-	leaq	192(%rsi),%rsi
-
-	vpand	%ymm7,%ymm5,%ymm5
-	vpand	%ymm7,%ymm6,%ymm6
-	vpand	%ymm11,%ymm9,%ymm9
-	vpand	%ymm11,%ymm10,%ymm10
-	vpand	%ymm15,%ymm13,%ymm13
-	vpand	%ymm15,%ymm14,%ymm14
-
-	vpxor	%ymm5,%ymm2,%ymm2
-	vpxor	%ymm6,%ymm3,%ymm3
-	vpxor	%ymm9,%ymm2,%ymm2
-	vpxor	%ymm10,%ymm3,%ymm3
-	vpxor	%ymm13,%ymm2,%ymm2
-	vpxor	%ymm14,%ymm3,%ymm3
-
-	decq	%rax
-	jnz	L$select_loop_avx2_w7
-
-
-	vmovdqa	0(%rsi),%ymm5
-	vmovdqa	32(%rsi),%ymm6
-
-	vpcmpeqd	%ymm1,%ymm4,%ymm7
-
-	vpand	%ymm7,%ymm5,%ymm5
-	vpand	%ymm7,%ymm6,%ymm6
-
-	vpxor	%ymm5,%ymm2,%ymm2
-	vpxor	%ymm6,%ymm3,%ymm3
-
-	vmovdqu	%ymm2,0(%rdi)
-	vmovdqu	%ymm3,32(%rdi)
-	vzeroupper
-	ret
-
-L$SEH_end_ecp_nistz256_avx2_select_w7:
-
-
-.p2align	5
-__ecp_nistz256_add_toq:
-
-	xorq	%r11,%r11
-	addq	0(%rbx),%r12
-	adcq	8(%rbx),%r13
-	movq	%r12,%rax
-	adcq	16(%rbx),%r8
-	adcq	24(%rbx),%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	cmovcq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_sub_fromq:
-
-	subq	0(%rbx),%r12
-	sbbq	8(%rbx),%r13
-	movq	%r12,%rax
-	sbbq	16(%rbx),%r8
-	sbbq	24(%rbx),%r9
-	movq	%r13,%rbp
-	sbbq	%r11,%r11
-
-	addq	$-1,%r12
-	movq	%r8,%rcx
-	adcq	%r14,%r13
-	adcq	$0,%r8
-	movq	%r9,%r10
-	adcq	%r15,%r9
-	testq	%r11,%r11
-
-	cmovzq	%rax,%r12
-	cmovzq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovzq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovzq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_subq:
-
-	subq	%r12,%rax
-	sbbq	%r13,%rbp
-	movq	%rax,%r12
-	sbbq	%r8,%rcx
-	sbbq	%r9,%r10
-	movq	%rbp,%r13
-	sbbq	%r11,%r11
-
-	addq	$-1,%rax
-	movq	%rcx,%r8
-	adcq	%r14,%rbp
-	adcq	$0,%rcx
-	movq	%r10,%r9
-	adcq	%r15,%r10
-	testq	%r11,%r11
-
-	cmovnzq	%rax,%r12
-	cmovnzq	%rbp,%r13
-	cmovnzq	%rcx,%r8
-	cmovnzq	%r10,%r9
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_mul_by_2q:
-
-	xorq	%r11,%r11
-	addq	%r12,%r12
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	cmovcq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-.globl	_ecp_nistz256_point_double
-.private_extern _ecp_nistz256_point_double
-
-.p2align	5
-_ecp_nistz256_point_double:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	cmpl	$0x80100,%ecx
-	je	L$point_doublex
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$160+8,%rsp
-
-L$point_doubleq_body:
-
-L$point_double_shortcutq:
-	movdqu	0(%rsi),%xmm0
-	movq	%rsi,%rbx
-	movdqu	16(%rsi),%xmm1
-	movq	32+0(%rsi),%r12
-	movq	32+8(%rsi),%r13
-	movq	32+16(%rsi),%r8
-	movq	32+24(%rsi),%r9
-	movq	L$poly+8(%rip),%r14
-	movq	L$poly+24(%rip),%r15
-	movdqa	%xmm0,96(%rsp)
-	movdqa	%xmm1,96+16(%rsp)
-	leaq	32(%rdi),%r10
-	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
-
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2q
-
-	movq	64+0(%rsi),%rax
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	leaq	64-0(%rsi),%rsi
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	0+0(%rsp),%rax
-	movq	8+0(%rsp),%r14
-	leaq	0+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	32(%rbx),%rax
-	movq	64+0(%rbx),%r9
-	movq	64+8(%rbx),%r10
-	movq	64+16(%rbx),%r11
-	movq	64+24(%rbx),%r12
-	leaq	64-0(%rbx),%rsi
-	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
-	call	__ecp_nistz256_mul_montq
-	call	__ecp_nistz256_mul_by_2q
-
-	movq	96+0(%rsp),%r12
-	movq	96+8(%rsp),%r13
-	leaq	64(%rsp),%rbx
-	movq	96+16(%rsp),%r8
-	movq	96+24(%rsp),%r9
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_add_toq
-
-	movq	96+0(%rsp),%r12
-	movq	96+8(%rsp),%r13
-	leaq	64(%rsp),%rbx
-	movq	96+16(%rsp),%r8
-	movq	96+24(%rsp),%r9
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	movq	0+0(%rsp),%rax
-	movq	8+0(%rsp),%r14
-	leaq	0+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
-	call	__ecp_nistz256_sqr_montq
-	xorq	%r9,%r9
-	movq	%r12,%rax
-	addq	$-1,%r12
-	movq	%r13,%r10
-	adcq	%rsi,%r13
-	movq	%r14,%rcx
-	adcq	$0,%r14
-	movq	%r15,%r8
-	adcq	%rbp,%r15
-	adcq	$0,%r9
-	xorq	%rsi,%rsi
-	testq	$1,%rax
-
-	cmovzq	%rax,%r12
-	cmovzq	%r10,%r13
-	cmovzq	%rcx,%r14
-	cmovzq	%r8,%r15
-	cmovzq	%rsi,%r9
-
-	movq	%r13,%rax
-	shrq	$1,%r12
-	shlq	$63,%rax
-	movq	%r14,%r10
-	shrq	$1,%r13
-	orq	%rax,%r12
-	shlq	$63,%r10
-	movq	%r15,%rcx
-	shrq	$1,%r14
-	orq	%r10,%r13
-	shlq	$63,%rcx
-	movq	%r12,0(%rdi)
-	shrq	$1,%r15
-	movq	%r13,8(%rdi)
-	shlq	$63,%r9
-	orq	%rcx,%r14
-	orq	%r9,%r15
-	movq	%r14,16(%rdi)
-	movq	%r15,24(%rdi)
-	movq	64(%rsp),%rax
-	leaq	64(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2q
-
-	leaq	32(%rsp),%rbx
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_add_toq
-
-	movq	96(%rsp),%rax
-	leaq	96(%rsp),%rbx
-	movq	0+0(%rsp),%r9
-	movq	8+0(%rsp),%r10
-	leaq	0+0(%rsp),%rsi
-	movq	16+0(%rsp),%r11
-	movq	24+0(%rsp),%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2q
-
-	movq	0+32(%rsp),%rax
-	movq	8+32(%rsp),%r14
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r15
-	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
-	call	__ecp_nistz256_sqr_montq
-
-	leaq	128(%rsp),%rbx
-	movq	%r14,%r8
-	movq	%r15,%r9
-	movq	%rsi,%r14
-	movq	%rbp,%r15
-	call	__ecp_nistz256_sub_fromq
-
-	movq	0+0(%rsp),%rax
-	movq	0+8(%rsp),%rbp
-	movq	0+16(%rsp),%rcx
-	movq	0+24(%rsp),%r10
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_subq
-
-	movq	32(%rsp),%rax
-	leaq	32(%rsp),%rbx
-	movq	%r12,%r14
-	xorl	%ecx,%ecx
-	movq	%r12,0+0(%rsp)
-	movq	%r13,%r10
-	movq	%r13,0+8(%rsp)
-	cmovzq	%r8,%r11
-	movq	%r8,0+16(%rsp)
-	leaq	0-0(%rsp),%rsi
-	cmovzq	%r9,%r12
-	movq	%r9,0+24(%rsp)
-	movq	%r14,%r9
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
-	call	__ecp_nistz256_sub_fromq
-
-	leaq	160+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$point_doubleq_epilogue:
-	ret
-
-
-.globl	_ecp_nistz256_point_add
-.private_extern _ecp_nistz256_point_add
-
-.p2align	5
-_ecp_nistz256_point_add:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	cmpl	$0x80100,%ecx
-	je	L$point_addx
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$576+8,%rsp
-
-L$point_addq_body:
-
-	movdqu	0(%rsi),%xmm0
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm3
-	movdqu	64(%rsi),%xmm4
-	movdqu	80(%rsi),%xmm5
-	movq	%rsi,%rbx
-	movq	%rdx,%rsi
-	movdqa	%xmm0,384(%rsp)
-	movdqa	%xmm1,384+16(%rsp)
-	movdqa	%xmm2,416(%rsp)
-	movdqa	%xmm3,416+16(%rsp)
-	movdqa	%xmm4,448(%rsp)
-	movdqa	%xmm5,448+16(%rsp)
-	por	%xmm4,%xmm5
-
-	movdqu	0(%rsi),%xmm0
-	pshufd	$0xb1,%xmm5,%xmm3
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	por	%xmm3,%xmm5
-	movdqu	48(%rsi),%xmm3
-	movq	64+0(%rsi),%rax
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	movdqa	%xmm0,480(%rsp)
-	pshufd	$0x1e,%xmm5,%xmm4
-	movdqa	%xmm1,480+16(%rsp)
-	movdqu	64(%rsi),%xmm0
-	movdqu	80(%rsi),%xmm1
-	movdqa	%xmm2,512(%rsp)
-	movdqa	%xmm3,512+16(%rsp)
-	por	%xmm4,%xmm5
-	pxor	%xmm4,%xmm4
-	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
-
-	leaq	64-0(%rsi),%rsi
-	movq	%rax,544+0(%rsp)
-	movq	%r14,544+8(%rsp)
-	movq	%r15,544+16(%rsp)
-	movq	%r8,544+24(%rsp)
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	pcmpeqd	%xmm4,%xmm5
-	pshufd	$0xb1,%xmm1,%xmm4
-	por	%xmm1,%xmm4
-	pshufd	$0,%xmm5,%xmm5
-	pshufd	$0x1e,%xmm4,%xmm3
-	por	%xmm3,%xmm4
-	pxor	%xmm3,%xmm3
-	pcmpeqd	%xmm3,%xmm4
-	pshufd	$0,%xmm4,%xmm4
-	movq	64+0(%rbx),%rax
-	movq	64+8(%rbx),%r14
-	movq	64+16(%rbx),%r15
-	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
-
-	leaq	64-0(%rbx),%rsi
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	544(%rsp),%rax
-	leaq	544(%rsp),%rbx
-	movq	0+96(%rsp),%r9
-	movq	8+96(%rsp),%r10
-	leaq	0+96(%rsp),%rsi
-	movq	16+96(%rsp),%r11
-	movq	24+96(%rsp),%r12
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	448(%rsp),%rax
-	leaq	448(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	416(%rsp),%rax
-	leaq	416(%rsp),%rbx
-	movq	0+224(%rsp),%r9
-	movq	8+224(%rsp),%r10
-	leaq	0+224(%rsp),%rsi
-	movq	16+224(%rsp),%r11
-	movq	24+224(%rsp),%r12
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	512(%rsp),%rax
-	leaq	512(%rsp),%rbx
-	movq	0+256(%rsp),%r9
-	movq	8+256(%rsp),%r10
-	leaq	0+256(%rsp),%rsi
-	movq	16+256(%rsp),%r11
-	movq	24+256(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	224(%rsp),%rbx
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	orq	%r13,%r12
-	movdqa	%xmm4,%xmm2
-	orq	%r8,%r12
-	orq	%r9,%r12
-	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
-
-	movq	384(%rsp),%rax
-	leaq	384(%rsp),%rbx
-	movq	0+96(%rsp),%r9
-	movq	8+96(%rsp),%r10
-	leaq	0+96(%rsp),%rsi
-	movq	16+96(%rsp),%r11
-	movq	24+96(%rsp),%r12
-	leaq	160(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	480(%rsp),%rax
-	leaq	480(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	160(%rsp),%rbx
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	orq	%r13,%r12
-	orq	%r8,%r12
-	orq	%r9,%r12
-
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
-	orq	%r8,%r12
-.byte	0x3e
-	jnz	L$add_proceedq
-
-
-
-	testq	%r9,%r9
-	jz	L$add_doubleq
-
-
-
-
-
-
-.byte	102,72,15,126,199
-	pxor	%xmm0,%xmm0
-	movdqu	%xmm0,0(%rdi)
-	movdqu	%xmm0,16(%rdi)
-	movdqu	%xmm0,32(%rdi)
-	movdqu	%xmm0,48(%rdi)
-	movdqu	%xmm0,64(%rdi)
-	movdqu	%xmm0,80(%rdi)
-	jmp	L$add_doneq
-
-.p2align	5
-L$add_doubleq:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
-	addq	$416,%rsp
-
-	jmp	L$point_double_shortcutq
-
-
-.p2align	5
-L$add_proceedq:
-	movq	0+64(%rsp),%rax
-	movq	8+64(%rsp),%r14
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r15
-	movq	24+64(%rsp),%r8
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	448(%rsp),%rax
-	leaq	448(%rsp),%rbx
-	movq	0+0(%rsp),%r9
-	movq	8+0(%rsp),%r10
-	leaq	0+0(%rsp),%rsi
-	movq	16+0(%rsp),%r11
-	movq	24+0(%rsp),%r12
-	leaq	352(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	0+0(%rsp),%rax
-	movq	8+0(%rsp),%r14
-	leaq	0+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	544(%rsp),%rax
-	leaq	544(%rsp),%rbx
-	movq	0+352(%rsp),%r9
-	movq	8+352(%rsp),%r10
-	leaq	0+352(%rsp),%rsi
-	movq	16+352(%rsp),%r11
-	movq	24+352(%rsp),%r12
-	leaq	352(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	0(%rsp),%rax
-	leaq	0(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	160(%rsp),%rax
-	leaq	160(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-
-
-
-	xorq	%r11,%r11
-	addq	%r12,%r12
-	leaq	96(%rsp),%rsi
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	movq	0(%rsi),%rax
-	cmovcq	%rbp,%r13
-	movq	8(%rsi),%rbp
-	cmovcq	%rcx,%r8
-	movq	16(%rsi),%rcx
-	cmovcq	%r10,%r9
-	movq	24(%rsi),%r10
-
-	call	__ecp_nistz256_subq
-
-	leaq	128(%rsp),%rbx
-	leaq	288(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	movq	192+0(%rsp),%rax
-	movq	192+8(%rsp),%rbp
-	movq	192+16(%rsp),%rcx
-	movq	192+24(%rsp),%r10
-	leaq	320(%rsp),%rdi
-
-	call	__ecp_nistz256_subq
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-	movq	128(%rsp),%rax
-	leaq	128(%rsp),%rbx
-	movq	0+224(%rsp),%r9
-	movq	8+224(%rsp),%r10
-	leaq	0+224(%rsp),%rsi
-	movq	16+224(%rsp),%r11
-	movq	24+224(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	320(%rsp),%rax
-	leaq	320(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	320(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	256(%rsp),%rbx
-	leaq	320(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-.byte	102,72,15,126,199
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	352(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	352+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	544(%rsp),%xmm2
-	pand	544+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	448(%rsp),%xmm2
-	pand	448+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,64(%rdi)
-	movdqu	%xmm3,80(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	288(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	288+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	480(%rsp),%xmm2
-	pand	480+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	384(%rsp),%xmm2
-	pand	384+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	320(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	320+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	512(%rsp),%xmm2
-	pand	512+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	416(%rsp),%xmm2
-	pand	416+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm3,48(%rdi)
-
-L$add_doneq:
-	leaq	576+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$point_addq_epilogue:
-	ret
-
-
-.globl	_ecp_nistz256_point_add_affine
-.private_extern _ecp_nistz256_point_add_affine
-
-.p2align	5
-_ecp_nistz256_point_add_affine:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%rcx
-	movq	8(%rcx),%rcx
-	andl	$0x80100,%ecx
-	cmpl	$0x80100,%ecx
-	je	L$point_add_affinex
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$480+8,%rsp
-
-L$add_affineq_body:
-
-	movdqu	0(%rsi),%xmm0
-	movq	%rdx,%rbx
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm3
-	movdqu	64(%rsi),%xmm4
-	movdqu	80(%rsi),%xmm5
-	movq	64+0(%rsi),%rax
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	movdqa	%xmm0,320(%rsp)
-	movdqa	%xmm1,320+16(%rsp)
-	movdqa	%xmm2,352(%rsp)
-	movdqa	%xmm3,352+16(%rsp)
-	movdqa	%xmm4,384(%rsp)
-	movdqa	%xmm5,384+16(%rsp)
-	por	%xmm4,%xmm5
-
-	movdqu	0(%rbx),%xmm0
-	pshufd	$0xb1,%xmm5,%xmm3
-	movdqu	16(%rbx),%xmm1
-	movdqu	32(%rbx),%xmm2
-	por	%xmm3,%xmm5
-	movdqu	48(%rbx),%xmm3
-	movdqa	%xmm0,416(%rsp)
-	pshufd	$0x1e,%xmm5,%xmm4
-	movdqa	%xmm1,416+16(%rsp)
-	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
-	movdqa	%xmm2,448(%rsp)
-	movdqa	%xmm3,448+16(%rsp)
-	por	%xmm2,%xmm3
-	por	%xmm4,%xmm5
-	pxor	%xmm4,%xmm4
-	por	%xmm1,%xmm3
-
-	leaq	64-0(%rsi),%rsi
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	pcmpeqd	%xmm4,%xmm5
-	pshufd	$0xb1,%xmm3,%xmm4
-	movq	0(%rbx),%rax
-
-	movq	%r12,%r9
-	por	%xmm3,%xmm4
-	pshufd	$0,%xmm5,%xmm5
-	pshufd	$0x1e,%xmm4,%xmm3
-	movq	%r13,%r10
-	por	%xmm3,%xmm4
-	pxor	%xmm3,%xmm3
-	movq	%r14,%r11
-	pcmpeqd	%xmm3,%xmm4
-	pshufd	$0,%xmm4,%xmm4
-
-	leaq	32-0(%rsp),%rsi
-	movq	%r15,%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	320(%rsp),%rbx
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	movq	384(%rsp),%rax
-	leaq	384(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	384(%rsp),%rax
-	leaq	384(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	288(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	448(%rsp),%rax
-	leaq	448(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	0+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	352(%rsp),%rbx
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	movq	0+64(%rsp),%rax
-	movq	8+64(%rsp),%r14
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r15
-	movq	24+64(%rsp),%r8
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	0+96(%rsp),%rax
-	movq	8+96(%rsp),%r14
-	leaq	0+96(%rsp),%rsi
-	movq	16+96(%rsp),%r15
-	movq	24+96(%rsp),%r8
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montq
-
-	movq	128(%rsp),%rax
-	leaq	128(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	160(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	320(%rsp),%rax
-	leaq	320(%rsp),%rbx
-	movq	0+128(%rsp),%r9
-	movq	8+128(%rsp),%r10
-	leaq	0+128(%rsp),%rsi
-	movq	16+128(%rsp),%r11
-	movq	24+128(%rsp),%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-
-
-
-	xorq	%r11,%r11
-	addq	%r12,%r12
-	leaq	192(%rsp),%rsi
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	movq	0(%rsi),%rax
-	cmovcq	%rbp,%r13
-	movq	8(%rsi),%rbp
-	cmovcq	%rcx,%r8
-	movq	16(%rsi),%rcx
-	cmovcq	%r10,%r9
-	movq	24(%rsi),%r10
-
-	call	__ecp_nistz256_subq
-
-	leaq	160(%rsp),%rbx
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-	movq	0+0(%rsp),%rax
-	movq	0+8(%rsp),%rbp
-	movq	0+16(%rsp),%rcx
-	movq	0+24(%rsp),%r10
-	leaq	64(%rsp),%rdi
-
-	call	__ecp_nistz256_subq
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-	movq	352(%rsp),%rax
-	leaq	352(%rsp),%rbx
-	movq	0+160(%rsp),%r9
-	movq	8+160(%rsp),%r10
-	leaq	0+160(%rsp),%rsi
-	movq	16+160(%rsp),%r11
-	movq	24+160(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	movq	96(%rsp),%rax
-	leaq	96(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	0+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_mul_montq
-
-	leaq	32(%rsp),%rbx
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromq
-
-.byte	102,72,15,126,199
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	288(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	288+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	L$ONE_mont(%rip),%xmm2
-	pand	L$ONE_mont+16(%rip),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	384(%rsp),%xmm2
-	pand	384+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,64(%rdi)
-	movdqu	%xmm3,80(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	224(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	224+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	416(%rsp),%xmm2
-	pand	416+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	320(%rsp),%xmm2
-	pand	320+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	256(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	256+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	448(%rsp),%xmm2
-	pand	448+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	352(%rsp),%xmm2
-	pand	352+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm3,48(%rdi)
-
-	leaq	480+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$add_affineq_epilogue:
-	ret
-
-
-
-.p2align	5
-__ecp_nistz256_add_tox:
-
-	xorq	%r11,%r11
-	adcq	0(%rbx),%r12
-	adcq	8(%rbx),%r13
-	movq	%r12,%rax
-	adcq	16(%rbx),%r8
-	adcq	24(%rbx),%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	xorq	%r10,%r10
-	sbbq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	cmovcq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_sub_fromx:
-
-	xorq	%r11,%r11
-	sbbq	0(%rbx),%r12
-	sbbq	8(%rbx),%r13
-	movq	%r12,%rax
-	sbbq	16(%rbx),%r8
-	sbbq	24(%rbx),%r9
-	movq	%r13,%rbp
-	sbbq	$0,%r11
-
-	xorq	%r10,%r10
-	adcq	$-1,%r12
-	movq	%r8,%rcx
-	adcq	%r14,%r13
-	adcq	$0,%r8
-	movq	%r9,%r10
-	adcq	%r15,%r9
-
-	btq	$0,%r11
-	cmovncq	%rax,%r12
-	cmovncq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovncq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovncq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_subx:
-
-	xorq	%r11,%r11
-	sbbq	%r12,%rax
-	sbbq	%r13,%rbp
-	movq	%rax,%r12
-	sbbq	%r8,%rcx
-	sbbq	%r9,%r10
-	movq	%rbp,%r13
-	sbbq	$0,%r11
-
-	xorq	%r9,%r9
-	adcq	$-1,%rax
-	movq	%rcx,%r8
-	adcq	%r14,%rbp
-	adcq	$0,%rcx
-	movq	%r10,%r9
-	adcq	%r15,%r10
-
-	btq	$0,%r11
-	cmovcq	%rax,%r12
-	cmovcq	%rbp,%r13
-	cmovcq	%rcx,%r8
-	cmovcq	%r10,%r9
-
-	ret
-
-
-
-
-.p2align	5
-__ecp_nistz256_mul_by_2x:
-
-	xorq	%r11,%r11
-	adcq	%r12,%r12
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	xorq	%r10,%r10
-	sbbq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	cmovcq	%rbp,%r13
-	movq	%r12,0(%rdi)
-	cmovcq	%rcx,%r8
-	movq	%r13,8(%rdi)
-	cmovcq	%r10,%r9
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-
-	ret
-
-
-
-.p2align	5
-ecp_nistz256_point_doublex:
-
-L$point_doublex:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$160+8,%rsp
-
-L$point_doublex_body:
-
-L$point_double_shortcutx:
-	movdqu	0(%rsi),%xmm0
-	movq	%rsi,%rbx
-	movdqu	16(%rsi),%xmm1
-	movq	32+0(%rsi),%r12
-	movq	32+8(%rsi),%r13
-	movq	32+16(%rsi),%r8
-	movq	32+24(%rsi),%r9
-	movq	L$poly+8(%rip),%r14
-	movq	L$poly+24(%rip),%r15
-	movdqa	%xmm0,96(%rsp)
-	movdqa	%xmm1,96+16(%rsp)
-	leaq	32(%rdi),%r10
-	leaq	64(%rdi),%r11
-.byte	102,72,15,110,199
-.byte	102,73,15,110,202
-.byte	102,73,15,110,211
-
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2x
-
-	movq	64+0(%rsi),%rdx
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	leaq	64-128(%rsi),%rsi
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	0+0(%rsp),%rdx
-	movq	8+0(%rsp),%r14
-	leaq	-128+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	32(%rbx),%rdx
-	movq	64+0(%rbx),%r9
-	movq	64+8(%rbx),%r10
-	movq	64+16(%rbx),%r11
-	movq	64+24(%rbx),%r12
-	leaq	64-128(%rbx),%rsi
-	leaq	32(%rbx),%rbx
-.byte	102,72,15,126,215
-	call	__ecp_nistz256_mul_montx
-	call	__ecp_nistz256_mul_by_2x
-
-	movq	96+0(%rsp),%r12
-	movq	96+8(%rsp),%r13
-	leaq	64(%rsp),%rbx
-	movq	96+16(%rsp),%r8
-	movq	96+24(%rsp),%r9
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_add_tox
-
-	movq	96+0(%rsp),%r12
-	movq	96+8(%rsp),%r13
-	leaq	64(%rsp),%rbx
-	movq	96+16(%rsp),%r8
-	movq	96+24(%rsp),%r9
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	movq	0+0(%rsp),%rdx
-	movq	8+0(%rsp),%r14
-	leaq	-128+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-.byte	102,72,15,126,207
-	call	__ecp_nistz256_sqr_montx
-	xorq	%r9,%r9
-	movq	%r12,%rax
-	addq	$-1,%r12
-	movq	%r13,%r10
-	adcq	%rsi,%r13
-	movq	%r14,%rcx
-	adcq	$0,%r14
-	movq	%r15,%r8
-	adcq	%rbp,%r15
-	adcq	$0,%r9
-	xorq	%rsi,%rsi
-	testq	$1,%rax
-
-	cmovzq	%rax,%r12
-	cmovzq	%r10,%r13
-	cmovzq	%rcx,%r14
-	cmovzq	%r8,%r15
-	cmovzq	%rsi,%r9
-
-	movq	%r13,%rax
-	shrq	$1,%r12
-	shlq	$63,%rax
-	movq	%r14,%r10
-	shrq	$1,%r13
-	orq	%rax,%r12
-	shlq	$63,%r10
-	movq	%r15,%rcx
-	shrq	$1,%r14
-	orq	%r10,%r13
-	shlq	$63,%rcx
-	movq	%r12,0(%rdi)
-	shrq	$1,%r15
-	movq	%r13,8(%rdi)
-	shlq	$63,%r9
-	orq	%rcx,%r14
-	orq	%r9,%r15
-	movq	%r14,16(%rdi)
-	movq	%r15,24(%rdi)
-	movq	64(%rsp),%rdx
-	leaq	64(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2x
-
-	leaq	32(%rsp),%rbx
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_add_tox
-
-	movq	96(%rsp),%rdx
-	leaq	96(%rsp),%rbx
-	movq	0+0(%rsp),%r9
-	movq	8+0(%rsp),%r10
-	leaq	-128+0(%rsp),%rsi
-	movq	16+0(%rsp),%r11
-	movq	24+0(%rsp),%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_by_2x
-
-	movq	0+32(%rsp),%rdx
-	movq	8+32(%rsp),%r14
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r15
-	movq	24+32(%rsp),%r8
-.byte	102,72,15,126,199
-	call	__ecp_nistz256_sqr_montx
-
-	leaq	128(%rsp),%rbx
-	movq	%r14,%r8
-	movq	%r15,%r9
-	movq	%rsi,%r14
-	movq	%rbp,%r15
-	call	__ecp_nistz256_sub_fromx
-
-	movq	0+0(%rsp),%rax
-	movq	0+8(%rsp),%rbp
-	movq	0+16(%rsp),%rcx
-	movq	0+24(%rsp),%r10
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_subx
-
-	movq	32(%rsp),%rdx
-	leaq	32(%rsp),%rbx
-	movq	%r12,%r14
-	xorl	%ecx,%ecx
-	movq	%r12,0+0(%rsp)
-	movq	%r13,%r10
-	movq	%r13,0+8(%rsp)
-	cmovzq	%r8,%r11
-	movq	%r8,0+16(%rsp)
-	leaq	0-128(%rsp),%rsi
-	cmovzq	%r9,%r12
-	movq	%r9,0+24(%rsp)
-	movq	%r14,%r9
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-.byte	102,72,15,126,203
-.byte	102,72,15,126,207
-	call	__ecp_nistz256_sub_fromx
-
-	leaq	160+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$point_doublex_epilogue:
-	ret
-
-
-
-.p2align	5
-ecp_nistz256_point_addx:
-
-L$point_addx:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$576+8,%rsp
-
-L$point_addx_body:
-
-	movdqu	0(%rsi),%xmm0
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm3
-	movdqu	64(%rsi),%xmm4
-	movdqu	80(%rsi),%xmm5
-	movq	%rsi,%rbx
-	movq	%rdx,%rsi
-	movdqa	%xmm0,384(%rsp)
-	movdqa	%xmm1,384+16(%rsp)
-	movdqa	%xmm2,416(%rsp)
-	movdqa	%xmm3,416+16(%rsp)
-	movdqa	%xmm4,448(%rsp)
-	movdqa	%xmm5,448+16(%rsp)
-	por	%xmm4,%xmm5
-
-	movdqu	0(%rsi),%xmm0
-	pshufd	$0xb1,%xmm5,%xmm3
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	por	%xmm3,%xmm5
-	movdqu	48(%rsi),%xmm3
-	movq	64+0(%rsi),%rdx
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	movdqa	%xmm0,480(%rsp)
-	pshufd	$0x1e,%xmm5,%xmm4
-	movdqa	%xmm1,480+16(%rsp)
-	movdqu	64(%rsi),%xmm0
-	movdqu	80(%rsi),%xmm1
-	movdqa	%xmm2,512(%rsp)
-	movdqa	%xmm3,512+16(%rsp)
-	por	%xmm4,%xmm5
-	pxor	%xmm4,%xmm4
-	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
-
-	leaq	64-128(%rsi),%rsi
-	movq	%rdx,544+0(%rsp)
-	movq	%r14,544+8(%rsp)
-	movq	%r15,544+16(%rsp)
-	movq	%r8,544+24(%rsp)
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	pcmpeqd	%xmm4,%xmm5
-	pshufd	$0xb1,%xmm1,%xmm4
-	por	%xmm1,%xmm4
-	pshufd	$0,%xmm5,%xmm5
-	pshufd	$0x1e,%xmm4,%xmm3
-	por	%xmm3,%xmm4
-	pxor	%xmm3,%xmm3
-	pcmpeqd	%xmm3,%xmm4
-	pshufd	$0,%xmm4,%xmm4
-	movq	64+0(%rbx),%rdx
-	movq	64+8(%rbx),%r14
-	movq	64+16(%rbx),%r15
-	movq	64+24(%rbx),%r8
-.byte	102,72,15,110,203
-
-	leaq	64-128(%rbx),%rsi
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	544(%rsp),%rdx
-	leaq	544(%rsp),%rbx
-	movq	0+96(%rsp),%r9
-	movq	8+96(%rsp),%r10
-	leaq	-128+96(%rsp),%rsi
-	movq	16+96(%rsp),%r11
-	movq	24+96(%rsp),%r12
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	448(%rsp),%rdx
-	leaq	448(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	416(%rsp),%rdx
-	leaq	416(%rsp),%rbx
-	movq	0+224(%rsp),%r9
-	movq	8+224(%rsp),%r10
-	leaq	-128+224(%rsp),%rsi
-	movq	16+224(%rsp),%r11
-	movq	24+224(%rsp),%r12
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	512(%rsp),%rdx
-	leaq	512(%rsp),%rbx
-	movq	0+256(%rsp),%r9
-	movq	8+256(%rsp),%r10
-	leaq	-128+256(%rsp),%rsi
-	movq	16+256(%rsp),%r11
-	movq	24+256(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	224(%rsp),%rbx
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	orq	%r13,%r12
-	movdqa	%xmm4,%xmm2
-	orq	%r8,%r12
-	orq	%r9,%r12
-	por	%xmm5,%xmm2
-.byte	102,73,15,110,220
-
-	movq	384(%rsp),%rdx
-	leaq	384(%rsp),%rbx
-	movq	0+96(%rsp),%r9
-	movq	8+96(%rsp),%r10
-	leaq	-128+96(%rsp),%rsi
-	movq	16+96(%rsp),%r11
-	movq	24+96(%rsp),%r12
-	leaq	160(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	480(%rsp),%rdx
-	leaq	480(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	160(%rsp),%rbx
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	orq	%r13,%r12
-	orq	%r8,%r12
-	orq	%r9,%r12
-
-.byte	102,73,15,126,208
-.byte	102,73,15,126,217
-	orq	%r8,%r12
-.byte	0x3e
-	jnz	L$add_proceedx
-
-
-
-	testq	%r9,%r9
-	jz	L$add_doublex
-
-
-
-
-
-
-.byte	102,72,15,126,199
-	pxor	%xmm0,%xmm0
-	movdqu	%xmm0,0(%rdi)
-	movdqu	%xmm0,16(%rdi)
-	movdqu	%xmm0,32(%rdi)
-	movdqu	%xmm0,48(%rdi)
-	movdqu	%xmm0,64(%rdi)
-	movdqu	%xmm0,80(%rdi)
-	jmp	L$add_donex
-
-.p2align	5
-L$add_doublex:
-.byte	102,72,15,126,206
-.byte	102,72,15,126,199
-	addq	$416,%rsp
-
-	jmp	L$point_double_shortcutx
-
-
-.p2align	5
-L$add_proceedx:
-	movq	0+64(%rsp),%rdx
-	movq	8+64(%rsp),%r14
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r15
-	movq	24+64(%rsp),%r8
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	448(%rsp),%rdx
-	leaq	448(%rsp),%rbx
-	movq	0+0(%rsp),%r9
-	movq	8+0(%rsp),%r10
-	leaq	-128+0(%rsp),%rsi
-	movq	16+0(%rsp),%r11
-	movq	24+0(%rsp),%r12
-	leaq	352(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	0+0(%rsp),%rdx
-	movq	8+0(%rsp),%r14
-	leaq	-128+0(%rsp),%rsi
-	movq	16+0(%rsp),%r15
-	movq	24+0(%rsp),%r8
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	544(%rsp),%rdx
-	leaq	544(%rsp),%rbx
-	movq	0+352(%rsp),%r9
-	movq	8+352(%rsp),%r10
-	leaq	-128+352(%rsp),%rsi
-	movq	16+352(%rsp),%r11
-	movq	24+352(%rsp),%r12
-	leaq	352(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	0(%rsp),%rdx
-	leaq	0(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	160(%rsp),%rdx
-	leaq	160(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-
-
-
-	xorq	%r11,%r11
-	addq	%r12,%r12
-	leaq	96(%rsp),%rsi
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	movq	0(%rsi),%rax
-	cmovcq	%rbp,%r13
-	movq	8(%rsi),%rbp
-	cmovcq	%rcx,%r8
-	movq	16(%rsi),%rcx
-	cmovcq	%r10,%r9
-	movq	24(%rsi),%r10
-
-	call	__ecp_nistz256_subx
-
-	leaq	128(%rsp),%rbx
-	leaq	288(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	movq	192+0(%rsp),%rax
-	movq	192+8(%rsp),%rbp
-	movq	192+16(%rsp),%rcx
-	movq	192+24(%rsp),%r10
-	leaq	320(%rsp),%rdi
-
-	call	__ecp_nistz256_subx
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-	movq	128(%rsp),%rdx
-	leaq	128(%rsp),%rbx
-	movq	0+224(%rsp),%r9
-	movq	8+224(%rsp),%r10
-	leaq	-128+224(%rsp),%rsi
-	movq	16+224(%rsp),%r11
-	movq	24+224(%rsp),%r12
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	320(%rsp),%rdx
-	leaq	320(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	320(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	256(%rsp),%rbx
-	leaq	320(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-.byte	102,72,15,126,199
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	352(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	352+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	544(%rsp),%xmm2
-	pand	544+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	448(%rsp),%xmm2
-	pand	448+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,64(%rdi)
-	movdqu	%xmm3,80(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	288(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	288+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	480(%rsp),%xmm2
-	pand	480+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	384(%rsp),%xmm2
-	pand	384+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	320(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	320+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	512(%rsp),%xmm2
-	pand	512+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	416(%rsp),%xmm2
-	pand	416+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm3,48(%rdi)
-
-L$add_donex:
-	leaq	576+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$point_addx_epilogue:
-	ret
-
-
-
-.p2align	5
-ecp_nistz256_point_add_affinex:
-
-L$point_add_affinex:
-	pushq	%rbp
-
-	pushq	%rbx
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	subq	$480+8,%rsp
-
-L$add_affinex_body:
-
-	movdqu	0(%rsi),%xmm0
-	movq	%rdx,%rbx
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-	movdqu	48(%rsi),%xmm3
-	movdqu	64(%rsi),%xmm4
-	movdqu	80(%rsi),%xmm5
-	movq	64+0(%rsi),%rdx
-	movq	64+8(%rsi),%r14
-	movq	64+16(%rsi),%r15
-	movq	64+24(%rsi),%r8
-	movdqa	%xmm0,320(%rsp)
-	movdqa	%xmm1,320+16(%rsp)
-	movdqa	%xmm2,352(%rsp)
-	movdqa	%xmm3,352+16(%rsp)
-	movdqa	%xmm4,384(%rsp)
-	movdqa	%xmm5,384+16(%rsp)
-	por	%xmm4,%xmm5
-
-	movdqu	0(%rbx),%xmm0
-	pshufd	$0xb1,%xmm5,%xmm3
-	movdqu	16(%rbx),%xmm1
-	movdqu	32(%rbx),%xmm2
-	por	%xmm3,%xmm5
-	movdqu	48(%rbx),%xmm3
-	movdqa	%xmm0,416(%rsp)
-	pshufd	$0x1e,%xmm5,%xmm4
-	movdqa	%xmm1,416+16(%rsp)
-	por	%xmm0,%xmm1
-.byte	102,72,15,110,199
-	movdqa	%xmm2,448(%rsp)
-	movdqa	%xmm3,448+16(%rsp)
-	por	%xmm2,%xmm3
-	por	%xmm4,%xmm5
-	pxor	%xmm4,%xmm4
-	por	%xmm1,%xmm3
-
-	leaq	64-128(%rsi),%rsi
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	pcmpeqd	%xmm4,%xmm5
-	pshufd	$0xb1,%xmm3,%xmm4
-	movq	0(%rbx),%rdx
-
-	movq	%r12,%r9
-	por	%xmm3,%xmm4
-	pshufd	$0,%xmm5,%xmm5
-	pshufd	$0x1e,%xmm4,%xmm3
-	movq	%r13,%r10
-	por	%xmm3,%xmm4
-	pxor	%xmm3,%xmm3
-	movq	%r14,%r11
-	pcmpeqd	%xmm3,%xmm4
-	pshufd	$0,%xmm4,%xmm4
-
-	leaq	32-128(%rsp),%rsi
-	movq	%r15,%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	320(%rsp),%rbx
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	movq	384(%rsp),%rdx
-	leaq	384(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	384(%rsp),%rdx
-	leaq	384(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	288(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	448(%rsp),%rdx
-	leaq	448(%rsp),%rbx
-	movq	0+32(%rsp),%r9
-	movq	8+32(%rsp),%r10
-	leaq	-128+32(%rsp),%rsi
-	movq	16+32(%rsp),%r11
-	movq	24+32(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	352(%rsp),%rbx
-	leaq	96(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	movq	0+64(%rsp),%rdx
-	movq	8+64(%rsp),%r14
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r15
-	movq	24+64(%rsp),%r8
-	leaq	128(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	0+96(%rsp),%rdx
-	movq	8+96(%rsp),%r14
-	leaq	-128+96(%rsp),%rsi
-	movq	16+96(%rsp),%r15
-	movq	24+96(%rsp),%r8
-	leaq	192(%rsp),%rdi
-	call	__ecp_nistz256_sqr_montx
-
-	movq	128(%rsp),%rdx
-	leaq	128(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	160(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	320(%rsp),%rdx
-	leaq	320(%rsp),%rbx
-	movq	0+128(%rsp),%r9
-	movq	8+128(%rsp),%r10
-	leaq	-128+128(%rsp),%rsi
-	movq	16+128(%rsp),%r11
-	movq	24+128(%rsp),%r12
-	leaq	0(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-
-
-
-	xorq	%r11,%r11
-	addq	%r12,%r12
-	leaq	192(%rsp),%rsi
-	adcq	%r13,%r13
-	movq	%r12,%rax
-	adcq	%r8,%r8
-	adcq	%r9,%r9
-	movq	%r13,%rbp
-	adcq	$0,%r11
-
-	subq	$-1,%r12
-	movq	%r8,%rcx
-	sbbq	%r14,%r13
-	sbbq	$0,%r8
-	movq	%r9,%r10
-	sbbq	%r15,%r9
-	sbbq	$0,%r11
-
-	cmovcq	%rax,%r12
-	movq	0(%rsi),%rax
-	cmovcq	%rbp,%r13
-	movq	8(%rsi),%rbp
-	cmovcq	%rcx,%r8
-	movq	16(%rsi),%rcx
-	cmovcq	%r10,%r9
-	movq	24(%rsi),%r10
-
-	call	__ecp_nistz256_subx
-
-	leaq	160(%rsp),%rbx
-	leaq	224(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-	movq	0+0(%rsp),%rax
-	movq	0+8(%rsp),%rbp
-	movq	0+16(%rsp),%rcx
-	movq	0+24(%rsp),%r10
-	leaq	64(%rsp),%rdi
-
-	call	__ecp_nistz256_subx
-
-	movq	%r12,0(%rdi)
-	movq	%r13,8(%rdi)
-	movq	%r8,16(%rdi)
-	movq	%r9,24(%rdi)
-	movq	352(%rsp),%rdx
-	leaq	352(%rsp),%rbx
-	movq	0+160(%rsp),%r9
-	movq	8+160(%rsp),%r10
-	leaq	-128+160(%rsp),%rsi
-	movq	16+160(%rsp),%r11
-	movq	24+160(%rsp),%r12
-	leaq	32(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	movq	96(%rsp),%rdx
-	leaq	96(%rsp),%rbx
-	movq	0+64(%rsp),%r9
-	movq	8+64(%rsp),%r10
-	leaq	-128+64(%rsp),%rsi
-	movq	16+64(%rsp),%r11
-	movq	24+64(%rsp),%r12
-	leaq	64(%rsp),%rdi
-	call	__ecp_nistz256_mul_montx
-
-	leaq	32(%rsp),%rbx
-	leaq	256(%rsp),%rdi
-	call	__ecp_nistz256_sub_fromx
-
-.byte	102,72,15,126,199
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	288(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	288+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	L$ONE_mont(%rip),%xmm2
-	pand	L$ONE_mont+16(%rip),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	384(%rsp),%xmm2
-	pand	384+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,64(%rdi)
-	movdqu	%xmm3,80(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	224(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	224+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	416(%rsp),%xmm2
-	pand	416+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	320(%rsp),%xmm2
-	pand	320+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,0(%rdi)
-	movdqu	%xmm3,16(%rdi)
-
-	movdqa	%xmm5,%xmm0
-	movdqa	%xmm5,%xmm1
-	pandn	256(%rsp),%xmm0
-	movdqa	%xmm5,%xmm2
-	pandn	256+16(%rsp),%xmm1
-	movdqa	%xmm5,%xmm3
-	pand	448(%rsp),%xmm2
-	pand	448+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-
-	movdqa	%xmm4,%xmm0
-	movdqa	%xmm4,%xmm1
-	pandn	%xmm2,%xmm0
-	movdqa	%xmm4,%xmm2
-	pandn	%xmm3,%xmm1
-	movdqa	%xmm4,%xmm3
-	pand	352(%rsp),%xmm2
-	pand	352+16(%rsp),%xmm3
-	por	%xmm0,%xmm2
-	por	%xmm1,%xmm3
-	movdqu	%xmm2,32(%rdi)
-	movdqu	%xmm3,48(%rdi)
-
-	leaq	480+56(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbx
-
-	movq	-8(%rsi),%rbp
-
-	leaq	(%rsi),%rsp
-
-L$add_affinex_epilogue:
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S b/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S
deleted file mode 100644
index fc6552c..0000000
--- a/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm-apple.S
+++ /dev/null
@@ -1,322 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-.private_extern	_beeu_mod_inverse_vartime
-.globl	_beeu_mod_inverse_vartime
-.private_extern _beeu_mod_inverse_vartime
-.p2align	5
-_beeu_mod_inverse_vartime:
-
-_CET_ENDBR
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	pushq	%rbx
-
-	pushq	%rsi
-
-
-	subq	$80,%rsp
-
-	movq	%rdi,0(%rsp)
-
-
-	movq	$1,%r8
-	xorq	%r9,%r9
-	xorq	%r10,%r10
-	xorq	%r11,%r11
-	xorq	%rdi,%rdi
-
-	xorq	%r12,%r12
-	xorq	%r13,%r13
-	xorq	%r14,%r14
-	xorq	%r15,%r15
-	xorq	%rbp,%rbp
-
-
-	vmovdqu	0(%rsi),%xmm0
-	vmovdqu	16(%rsi),%xmm1
-	vmovdqu	%xmm0,48(%rsp)
-	vmovdqu	%xmm1,64(%rsp)
-
-	vmovdqu	0(%rdx),%xmm0
-	vmovdqu	16(%rdx),%xmm1
-	vmovdqu	%xmm0,16(%rsp)
-	vmovdqu	%xmm1,32(%rsp)
-
-L$beeu_loop:
-	xorq	%rbx,%rbx
-	orq	48(%rsp),%rbx
-	orq	56(%rsp),%rbx
-	orq	64(%rsp),%rbx
-	orq	72(%rsp),%rbx
-	jz	L$beeu_loop_end
-
-
-
-
-
-
-
-
-
-
-	movq	$1,%rcx
-
-
-L$beeu_shift_loop_XB:
-	movq	%rcx,%rbx
-	andq	48(%rsp),%rbx
-	jnz	L$beeu_shift_loop_end_XB
-
-
-	movq	$1,%rbx
-	andq	%r8,%rbx
-	jz	L$shift1_0
-	addq	0(%rdx),%r8
-	adcq	8(%rdx),%r9
-	adcq	16(%rdx),%r10
-	adcq	24(%rdx),%r11
-	adcq	$0,%rdi
-
-L$shift1_0:
-	shrdq	$1,%r9,%r8
-	shrdq	$1,%r10,%r9
-	shrdq	$1,%r11,%r10
-	shrdq	$1,%rdi,%r11
-	shrq	$1,%rdi
-
-	shlq	$1,%rcx
-
-
-
-
-
-	cmpq	$0x8000000,%rcx
-	jne	L$beeu_shift_loop_XB
-
-L$beeu_shift_loop_end_XB:
-	bsfq	%rcx,%rcx
-	testq	%rcx,%rcx
-	jz	L$beeu_no_shift_XB
-
-
-
-	movq	8+48(%rsp),%rax
-	movq	16+48(%rsp),%rbx
-	movq	24+48(%rsp),%rsi
-
-	shrdq	%cl,%rax,0+48(%rsp)
-	shrdq	%cl,%rbx,8+48(%rsp)
-	shrdq	%cl,%rsi,16+48(%rsp)
-
-	shrq	%cl,%rsi
-	movq	%rsi,24+48(%rsp)
-
-
-L$beeu_no_shift_XB:
-
-	movq	$1,%rcx
-
-
-L$beeu_shift_loop_YA:
-	movq	%rcx,%rbx
-	andq	16(%rsp),%rbx
-	jnz	L$beeu_shift_loop_end_YA
-
-
-	movq	$1,%rbx
-	andq	%r12,%rbx
-	jz	L$shift1_1
-	addq	0(%rdx),%r12
-	adcq	8(%rdx),%r13
-	adcq	16(%rdx),%r14
-	adcq	24(%rdx),%r15
-	adcq	$0,%rbp
-
-L$shift1_1:
-	shrdq	$1,%r13,%r12
-	shrdq	$1,%r14,%r13
-	shrdq	$1,%r15,%r14
-	shrdq	$1,%rbp,%r15
-	shrq	$1,%rbp
-
-	shlq	$1,%rcx
-
-
-
-
-
-	cmpq	$0x8000000,%rcx
-	jne	L$beeu_shift_loop_YA
-
-L$beeu_shift_loop_end_YA:
-	bsfq	%rcx,%rcx
-	testq	%rcx,%rcx
-	jz	L$beeu_no_shift_YA
-
-
-
-	movq	8+16(%rsp),%rax
-	movq	16+16(%rsp),%rbx
-	movq	24+16(%rsp),%rsi
-
-	shrdq	%cl,%rax,0+16(%rsp)
-	shrdq	%cl,%rbx,8+16(%rsp)
-	shrdq	%cl,%rsi,16+16(%rsp)
-
-	shrq	%cl,%rsi
-	movq	%rsi,24+16(%rsp)
-
-
-L$beeu_no_shift_YA:
-
-	movq	48(%rsp),%rax
-	movq	56(%rsp),%rbx
-	movq	64(%rsp),%rsi
-	movq	72(%rsp),%rcx
-	subq	16(%rsp),%rax
-	sbbq	24(%rsp),%rbx
-	sbbq	32(%rsp),%rsi
-	sbbq	40(%rsp),%rcx
-	jnc	L$beeu_B_bigger_than_A
-
-
-	movq	16(%rsp),%rax
-	movq	24(%rsp),%rbx
-	movq	32(%rsp),%rsi
-	movq	40(%rsp),%rcx
-	subq	48(%rsp),%rax
-	sbbq	56(%rsp),%rbx
-	sbbq	64(%rsp),%rsi
-	sbbq	72(%rsp),%rcx
-	movq	%rax,16(%rsp)
-	movq	%rbx,24(%rsp)
-	movq	%rsi,32(%rsp)
-	movq	%rcx,40(%rsp)
-
-
-	addq	%r8,%r12
-	adcq	%r9,%r13
-	adcq	%r10,%r14
-	adcq	%r11,%r15
-	adcq	%rdi,%rbp
-	jmp	L$beeu_loop
-
-L$beeu_B_bigger_than_A:
-
-	movq	%rax,48(%rsp)
-	movq	%rbx,56(%rsp)
-	movq	%rsi,64(%rsp)
-	movq	%rcx,72(%rsp)
-
-
-	addq	%r12,%r8
-	adcq	%r13,%r9
-	adcq	%r14,%r10
-	adcq	%r15,%r11
-	adcq	%rbp,%rdi
-
-	jmp	L$beeu_loop
-
-L$beeu_loop_end:
-
-
-
-
-	movq	16(%rsp),%rbx
-	subq	$1,%rbx
-	orq	24(%rsp),%rbx
-	orq	32(%rsp),%rbx
-	orq	40(%rsp),%rbx
-
-	jnz	L$beeu_err
-
-
-
-
-	movq	0(%rdx),%r8
-	movq	8(%rdx),%r9
-	movq	16(%rdx),%r10
-	movq	24(%rdx),%r11
-	xorq	%rdi,%rdi
-
-L$beeu_reduction_loop:
-	movq	%r12,16(%rsp)
-	movq	%r13,24(%rsp)
-	movq	%r14,32(%rsp)
-	movq	%r15,40(%rsp)
-	movq	%rbp,48(%rsp)
-
-
-	subq	%r8,%r12
-	sbbq	%r9,%r13
-	sbbq	%r10,%r14
-	sbbq	%r11,%r15
-	sbbq	$0,%rbp
-
-
-	cmovcq	16(%rsp),%r12
-	cmovcq	24(%rsp),%r13
-	cmovcq	32(%rsp),%r14
-	cmovcq	40(%rsp),%r15
-	jnc	L$beeu_reduction_loop
-
-
-	subq	%r12,%r8
-	sbbq	%r13,%r9
-	sbbq	%r14,%r10
-	sbbq	%r15,%r11
-
-L$beeu_save:
-
-	movq	0(%rsp),%rdi
-
-	movq	%r8,0(%rdi)
-	movq	%r9,8(%rdi)
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-
-	movq	$1,%rax
-	jmp	L$beeu_finish
-
-L$beeu_err:
-
-	xorq	%rax,%rax
-
-L$beeu_finish:
-	addq	$80,%rsp
-
-	popq	%rsi
-
-	popq	%rbx
-
-	popq	%r15
-
-	popq	%r14
-
-	popq	%r13
-
-	popq	%r12
-
-	popq	%rbp
-
-	ret
-
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S
deleted file mode 100644
index 5fdf105..0000000
--- a/apple-x86_64/crypto/fipsmodule/rdrand-x86_64-apple.S
+++ /dev/null
@@ -1,57 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-
-.globl	_CRYPTO_rdrand
-.private_extern _CRYPTO_rdrand
-
-.p2align	4
-_CRYPTO_rdrand:
-
-_CET_ENDBR
-	xorq	%rax,%rax
-.byte	72,15,199,242
-
-	adcq	%rax,%rax
-	movq	%rdx,0(%rdi)
-	ret
-
-
-
-
-
-
-
-.globl	_CRYPTO_rdrand_multiple8_buf
-.private_extern _CRYPTO_rdrand_multiple8_buf
-
-.p2align	4
-_CRYPTO_rdrand_multiple8_buf:
-
-_CET_ENDBR
-	testq	%rsi,%rsi
-	jz	L$out
-	movq	$8,%rdx
-L$loop:
-.byte	72,15,199,241
-	jnc	L$err
-	movq	%rcx,0(%rdi)
-	addq	%rdx,%rdi
-	subq	%rdx,%rsi
-	jnz	L$loop
-L$out:
-	movq	$1,%rax
-	ret
-L$err:
-	xorq	%rax,%rax
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S b/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S
deleted file mode 100644
index 3672309..0000000
--- a/apple-x86_64/crypto/fipsmodule/rsaz-avx2-apple.S
+++ /dev/null
@@ -1,1749 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-.globl	_rsaz_1024_sqr_avx2
-.private_extern _rsaz_1024_sqr_avx2
-
-.p2align	6
-_rsaz_1024_sqr_avx2:
-
-_CET_ENDBR
-	leaq	(%rsp),%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	vzeroupper
-	movq	%rax,%rbp
-
-	movq	%rdx,%r13
-	subq	$832,%rsp
-	movq	%r13,%r15
-	subq	$-128,%rdi
-	subq	$-128,%rsi
-	subq	$-128,%r13
-
-	andq	$4095,%r15
-	addq	$320,%r15
-	shrq	$12,%r15
-	vpxor	%ymm9,%ymm9,%ymm9
-	jz	L$sqr_1024_no_n_copy
-
-
-
-
-
-	subq	$320,%rsp
-	vmovdqu	0-128(%r13),%ymm0
-	andq	$-2048,%rsp
-	vmovdqu	32-128(%r13),%ymm1
-	vmovdqu	64-128(%r13),%ymm2
-	vmovdqu	96-128(%r13),%ymm3
-	vmovdqu	128-128(%r13),%ymm4
-	vmovdqu	160-128(%r13),%ymm5
-	vmovdqu	192-128(%r13),%ymm6
-	vmovdqu	224-128(%r13),%ymm7
-	vmovdqu	256-128(%r13),%ymm8
-	leaq	832+128(%rsp),%r13
-	vmovdqu	%ymm0,0-128(%r13)
-	vmovdqu	%ymm1,32-128(%r13)
-	vmovdqu	%ymm2,64-128(%r13)
-	vmovdqu	%ymm3,96-128(%r13)
-	vmovdqu	%ymm4,128-128(%r13)
-	vmovdqu	%ymm5,160-128(%r13)
-	vmovdqu	%ymm6,192-128(%r13)
-	vmovdqu	%ymm7,224-128(%r13)
-	vmovdqu	%ymm8,256-128(%r13)
-	vmovdqu	%ymm9,288-128(%r13)
-
-L$sqr_1024_no_n_copy:
-	andq	$-1024,%rsp
-
-	vmovdqu	32-128(%rsi),%ymm1
-	vmovdqu	64-128(%rsi),%ymm2
-	vmovdqu	96-128(%rsi),%ymm3
-	vmovdqu	128-128(%rsi),%ymm4
-	vmovdqu	160-128(%rsi),%ymm5
-	vmovdqu	192-128(%rsi),%ymm6
-	vmovdqu	224-128(%rsi),%ymm7
-	vmovdqu	256-128(%rsi),%ymm8
-
-	leaq	192(%rsp),%rbx
-	vmovdqu	L$and_mask(%rip),%ymm15
-	jmp	L$OOP_GRANDE_SQR_1024
-
-.p2align	5
-L$OOP_GRANDE_SQR_1024:
-	leaq	576+128(%rsp),%r9
-	leaq	448(%rsp),%r12
-
-
-
-
-	vpaddq	%ymm1,%ymm1,%ymm1
-	vpbroadcastq	0-128(%rsi),%ymm10
-	vpaddq	%ymm2,%ymm2,%ymm2
-	vmovdqa	%ymm1,0-128(%r9)
-	vpaddq	%ymm3,%ymm3,%ymm3
-	vmovdqa	%ymm2,32-128(%r9)
-	vpaddq	%ymm4,%ymm4,%ymm4
-	vmovdqa	%ymm3,64-128(%r9)
-	vpaddq	%ymm5,%ymm5,%ymm5
-	vmovdqa	%ymm4,96-128(%r9)
-	vpaddq	%ymm6,%ymm6,%ymm6
-	vmovdqa	%ymm5,128-128(%r9)
-	vpaddq	%ymm7,%ymm7,%ymm7
-	vmovdqa	%ymm6,160-128(%r9)
-	vpaddq	%ymm8,%ymm8,%ymm8
-	vmovdqa	%ymm7,192-128(%r9)
-	vpxor	%ymm9,%ymm9,%ymm9
-	vmovdqa	%ymm8,224-128(%r9)
-
-	vpmuludq	0-128(%rsi),%ymm10,%ymm0
-	vpbroadcastq	32-128(%rsi),%ymm11
-	vmovdqu	%ymm9,288-192(%rbx)
-	vpmuludq	%ymm10,%ymm1,%ymm1
-	vmovdqu	%ymm9,320-448(%r12)
-	vpmuludq	%ymm10,%ymm2,%ymm2
-	vmovdqu	%ymm9,352-448(%r12)
-	vpmuludq	%ymm10,%ymm3,%ymm3
-	vmovdqu	%ymm9,384-448(%r12)
-	vpmuludq	%ymm10,%ymm4,%ymm4
-	vmovdqu	%ymm9,416-448(%r12)
-	vpmuludq	%ymm10,%ymm5,%ymm5
-	vmovdqu	%ymm9,448-448(%r12)
-	vpmuludq	%ymm10,%ymm6,%ymm6
-	vmovdqu	%ymm9,480-448(%r12)
-	vpmuludq	%ymm10,%ymm7,%ymm7
-	vmovdqu	%ymm9,512-448(%r12)
-	vpmuludq	%ymm10,%ymm8,%ymm8
-	vpbroadcastq	64-128(%rsi),%ymm10
-	vmovdqu	%ymm9,544-448(%r12)
-
-	movq	%rsi,%r15
-	movl	$4,%r14d
-	jmp	L$sqr_entry_1024
-.p2align	5
-L$OOP_SQR_1024:
-	vpbroadcastq	32-128(%r15),%ymm11
-	vpmuludq	0-128(%rsi),%ymm10,%ymm0
-	vpaddq	0-192(%rbx),%ymm0,%ymm0
-	vpmuludq	0-128(%r9),%ymm10,%ymm1
-	vpaddq	32-192(%rbx),%ymm1,%ymm1
-	vpmuludq	32-128(%r9),%ymm10,%ymm2
-	vpaddq	64-192(%rbx),%ymm2,%ymm2
-	vpmuludq	64-128(%r9),%ymm10,%ymm3
-	vpaddq	96-192(%rbx),%ymm3,%ymm3
-	vpmuludq	96-128(%r9),%ymm10,%ymm4
-	vpaddq	128-192(%rbx),%ymm4,%ymm4
-	vpmuludq	128-128(%r9),%ymm10,%ymm5
-	vpaddq	160-192(%rbx),%ymm5,%ymm5
-	vpmuludq	160-128(%r9),%ymm10,%ymm6
-	vpaddq	192-192(%rbx),%ymm6,%ymm6
-	vpmuludq	192-128(%r9),%ymm10,%ymm7
-	vpaddq	224-192(%rbx),%ymm7,%ymm7
-	vpmuludq	224-128(%r9),%ymm10,%ymm8
-	vpbroadcastq	64-128(%r15),%ymm10
-	vpaddq	256-192(%rbx),%ymm8,%ymm8
-L$sqr_entry_1024:
-	vmovdqu	%ymm0,0-192(%rbx)
-	vmovdqu	%ymm1,32-192(%rbx)
-
-	vpmuludq	32-128(%rsi),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	32-128(%r9),%ymm11,%ymm14
-	vpaddq	%ymm14,%ymm3,%ymm3
-	vpmuludq	64-128(%r9),%ymm11,%ymm13
-	vpaddq	%ymm13,%ymm4,%ymm4
-	vpmuludq	96-128(%r9),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	128-128(%r9),%ymm11,%ymm14
-	vpaddq	%ymm14,%ymm6,%ymm6
-	vpmuludq	160-128(%r9),%ymm11,%ymm13
-	vpaddq	%ymm13,%ymm7,%ymm7
-	vpmuludq	192-128(%r9),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	224-128(%r9),%ymm11,%ymm0
-	vpbroadcastq	96-128(%r15),%ymm11
-	vpaddq	288-192(%rbx),%ymm0,%ymm0
-
-	vmovdqu	%ymm2,64-192(%rbx)
-	vmovdqu	%ymm3,96-192(%rbx)
-
-	vpmuludq	64-128(%rsi),%ymm10,%ymm13
-	vpaddq	%ymm13,%ymm4,%ymm4
-	vpmuludq	64-128(%r9),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	96-128(%r9),%ymm10,%ymm14
-	vpaddq	%ymm14,%ymm6,%ymm6
-	vpmuludq	128-128(%r9),%ymm10,%ymm13
-	vpaddq	%ymm13,%ymm7,%ymm7
-	vpmuludq	160-128(%r9),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	192-128(%r9),%ymm10,%ymm14
-	vpaddq	%ymm14,%ymm0,%ymm0
-	vpmuludq	224-128(%r9),%ymm10,%ymm1
-	vpbroadcastq	128-128(%r15),%ymm10
-	vpaddq	320-448(%r12),%ymm1,%ymm1
-
-	vmovdqu	%ymm4,128-192(%rbx)
-	vmovdqu	%ymm5,160-192(%rbx)
-
-	vpmuludq	96-128(%rsi),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm6,%ymm6
-	vpmuludq	96-128(%r9),%ymm11,%ymm14
-	vpaddq	%ymm14,%ymm7,%ymm7
-	vpmuludq	128-128(%r9),%ymm11,%ymm13
-	vpaddq	%ymm13,%ymm8,%ymm8
-	vpmuludq	160-128(%r9),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm0,%ymm0
-	vpmuludq	192-128(%r9),%ymm11,%ymm14
-	vpaddq	%ymm14,%ymm1,%ymm1
-	vpmuludq	224-128(%r9),%ymm11,%ymm2
-	vpbroadcastq	160-128(%r15),%ymm11
-	vpaddq	352-448(%r12),%ymm2,%ymm2
-
-	vmovdqu	%ymm6,192-192(%rbx)
-	vmovdqu	%ymm7,224-192(%rbx)
-
-	vpmuludq	128-128(%rsi),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	128-128(%r9),%ymm10,%ymm14
-	vpaddq	%ymm14,%ymm0,%ymm0
-	vpmuludq	160-128(%r9),%ymm10,%ymm13
-	vpaddq	%ymm13,%ymm1,%ymm1
-	vpmuludq	192-128(%r9),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	224-128(%r9),%ymm10,%ymm3
-	vpbroadcastq	192-128(%r15),%ymm10
-	vpaddq	384-448(%r12),%ymm3,%ymm3
-
-	vmovdqu	%ymm8,256-192(%rbx)
-	vmovdqu	%ymm0,288-192(%rbx)
-	leaq	8(%rbx),%rbx
-
-	vpmuludq	160-128(%rsi),%ymm11,%ymm13
-	vpaddq	%ymm13,%ymm1,%ymm1
-	vpmuludq	160-128(%r9),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	192-128(%r9),%ymm11,%ymm14
-	vpaddq	%ymm14,%ymm3,%ymm3
-	vpmuludq	224-128(%r9),%ymm11,%ymm4
-	vpbroadcastq	224-128(%r15),%ymm11
-	vpaddq	416-448(%r12),%ymm4,%ymm4
-
-	vmovdqu	%ymm1,320-448(%r12)
-	vmovdqu	%ymm2,352-448(%r12)
-
-	vpmuludq	192-128(%rsi),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm3,%ymm3
-	vpmuludq	192-128(%r9),%ymm10,%ymm14
-	vpbroadcastq	256-128(%r15),%ymm0
-	vpaddq	%ymm14,%ymm4,%ymm4
-	vpmuludq	224-128(%r9),%ymm10,%ymm5
-	vpbroadcastq	0+8-128(%r15),%ymm10
-	vpaddq	448-448(%r12),%ymm5,%ymm5
-
-	vmovdqu	%ymm3,384-448(%r12)
-	vmovdqu	%ymm4,416-448(%r12)
-	leaq	8(%r15),%r15
-
-	vpmuludq	224-128(%rsi),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	224-128(%r9),%ymm11,%ymm6
-	vpaddq	480-448(%r12),%ymm6,%ymm6
-
-	vpmuludq	256-128(%rsi),%ymm0,%ymm7
-	vmovdqu	%ymm5,448-448(%r12)
-	vpaddq	512-448(%r12),%ymm7,%ymm7
-	vmovdqu	%ymm6,480-448(%r12)
-	vmovdqu	%ymm7,512-448(%r12)
-	leaq	8(%r12),%r12
-
-	decl	%r14d
-	jnz	L$OOP_SQR_1024
-
-	vmovdqu	256(%rsp),%ymm8
-	vmovdqu	288(%rsp),%ymm1
-	vmovdqu	320(%rsp),%ymm2
-	leaq	192(%rsp),%rbx
-
-	vpsrlq	$29,%ymm8,%ymm14
-	vpand	%ymm15,%ymm8,%ymm8
-	vpsrlq	$29,%ymm1,%ymm11
-	vpand	%ymm15,%ymm1,%ymm1
-
-	vpermq	$0x93,%ymm14,%ymm14
-	vpxor	%ymm9,%ymm9,%ymm9
-	vpermq	$0x93,%ymm11,%ymm11
-
-	vpblendd	$3,%ymm9,%ymm14,%ymm10
-	vpblendd	$3,%ymm14,%ymm11,%ymm14
-	vpaddq	%ymm10,%ymm8,%ymm8
-	vpblendd	$3,%ymm11,%ymm9,%ymm11
-	vpaddq	%ymm14,%ymm1,%ymm1
-	vpaddq	%ymm11,%ymm2,%ymm2
-	vmovdqu	%ymm1,288-192(%rbx)
-	vmovdqu	%ymm2,320-192(%rbx)
-
-	movq	(%rsp),%rax
-	movq	8(%rsp),%r10
-	movq	16(%rsp),%r11
-	movq	24(%rsp),%r12
-	vmovdqu	32(%rsp),%ymm1
-	vmovdqu	64-192(%rbx),%ymm2
-	vmovdqu	96-192(%rbx),%ymm3
-	vmovdqu	128-192(%rbx),%ymm4
-	vmovdqu	160-192(%rbx),%ymm5
-	vmovdqu	192-192(%rbx),%ymm6
-	vmovdqu	224-192(%rbx),%ymm7
-
-	movq	%rax,%r9
-	imull	%ecx,%eax
-	andl	$0x1fffffff,%eax
-	vmovd	%eax,%xmm12
-
-	movq	%rax,%rdx
-	imulq	-128(%r13),%rax
-	vpbroadcastq	%xmm12,%ymm12
-	addq	%rax,%r9
-	movq	%rdx,%rax
-	imulq	8-128(%r13),%rax
-	shrq	$29,%r9
-	addq	%rax,%r10
-	movq	%rdx,%rax
-	imulq	16-128(%r13),%rax
-	addq	%r9,%r10
-	addq	%rax,%r11
-	imulq	24-128(%r13),%rdx
-	addq	%rdx,%r12
-
-	movq	%r10,%rax
-	imull	%ecx,%eax
-	andl	$0x1fffffff,%eax
-
-	movl	$9,%r14d
-	jmp	L$OOP_REDUCE_1024
-
-.p2align	5
-L$OOP_REDUCE_1024:
-	vmovd	%eax,%xmm13
-	vpbroadcastq	%xmm13,%ymm13
-
-	vpmuludq	32-128(%r13),%ymm12,%ymm10
-	movq	%rax,%rdx
-	imulq	-128(%r13),%rax
-	vpaddq	%ymm10,%ymm1,%ymm1
-	addq	%rax,%r10
-	vpmuludq	64-128(%r13),%ymm12,%ymm14
-	movq	%rdx,%rax
-	imulq	8-128(%r13),%rax
-	vpaddq	%ymm14,%ymm2,%ymm2
-	vpmuludq	96-128(%r13),%ymm12,%ymm11
-.byte	0x67
-	addq	%rax,%r11
-.byte	0x67
-	movq	%rdx,%rax
-	imulq	16-128(%r13),%rax
-	shrq	$29,%r10
-	vpaddq	%ymm11,%ymm3,%ymm3
-	vpmuludq	128-128(%r13),%ymm12,%ymm10
-	addq	%rax,%r12
-	addq	%r10,%r11
-	vpaddq	%ymm10,%ymm4,%ymm4
-	vpmuludq	160-128(%r13),%ymm12,%ymm14
-	movq	%r11,%rax
-	imull	%ecx,%eax
-	vpaddq	%ymm14,%ymm5,%ymm5
-	vpmuludq	192-128(%r13),%ymm12,%ymm11
-	andl	$0x1fffffff,%eax
-	vpaddq	%ymm11,%ymm6,%ymm6
-	vpmuludq	224-128(%r13),%ymm12,%ymm10
-	vpaddq	%ymm10,%ymm7,%ymm7
-	vpmuludq	256-128(%r13),%ymm12,%ymm14
-	vmovd	%eax,%xmm12
-
-	vpaddq	%ymm14,%ymm8,%ymm8
-
-	vpbroadcastq	%xmm12,%ymm12
-
-	vpmuludq	32-8-128(%r13),%ymm13,%ymm11
-	vmovdqu	96-8-128(%r13),%ymm14
-	movq	%rax,%rdx
-	imulq	-128(%r13),%rax
-	vpaddq	%ymm11,%ymm1,%ymm1
-	vpmuludq	64-8-128(%r13),%ymm13,%ymm10
-	vmovdqu	128-8-128(%r13),%ymm11
-	addq	%rax,%r11
-	movq	%rdx,%rax
-	imulq	8-128(%r13),%rax
-	vpaddq	%ymm10,%ymm2,%ymm2
-	addq	%r12,%rax
-	shrq	$29,%r11
-	vpmuludq	%ymm13,%ymm14,%ymm14
-	vmovdqu	160-8-128(%r13),%ymm10
-	addq	%r11,%rax
-	vpaddq	%ymm14,%ymm3,%ymm3
-	vpmuludq	%ymm13,%ymm11,%ymm11
-	vmovdqu	192-8-128(%r13),%ymm14
-.byte	0x67
-	movq	%rax,%r12
-	imull	%ecx,%eax
-	vpaddq	%ymm11,%ymm4,%ymm4
-	vpmuludq	%ymm13,%ymm10,%ymm10
-.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
-	andl	$0x1fffffff,%eax
-	vpaddq	%ymm10,%ymm5,%ymm5
-	vpmuludq	%ymm13,%ymm14,%ymm14
-	vmovdqu	256-8-128(%r13),%ymm10
-	vpaddq	%ymm14,%ymm6,%ymm6
-	vpmuludq	%ymm13,%ymm11,%ymm11
-	vmovdqu	288-8-128(%r13),%ymm9
-	vmovd	%eax,%xmm0
-	imulq	-128(%r13),%rax
-	vpaddq	%ymm11,%ymm7,%ymm7
-	vpmuludq	%ymm13,%ymm10,%ymm10
-	vmovdqu	32-16-128(%r13),%ymm14
-	vpbroadcastq	%xmm0,%ymm0
-	vpaddq	%ymm10,%ymm8,%ymm8
-	vpmuludq	%ymm13,%ymm9,%ymm9
-	vmovdqu	64-16-128(%r13),%ymm11
-	addq	%rax,%r12
-
-	vmovdqu	32-24-128(%r13),%ymm13
-	vpmuludq	%ymm12,%ymm14,%ymm14
-	vmovdqu	96-16-128(%r13),%ymm10
-	vpaddq	%ymm14,%ymm1,%ymm1
-	vpmuludq	%ymm0,%ymm13,%ymm13
-	vpmuludq	%ymm12,%ymm11,%ymm11
-.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
-	vpaddq	%ymm1,%ymm13,%ymm13
-	vpaddq	%ymm11,%ymm2,%ymm2
-	vpmuludq	%ymm12,%ymm10,%ymm10
-	vmovdqu	160-16-128(%r13),%ymm11
-.byte	0x67
-	vmovq	%xmm13,%rax
-	vmovdqu	%ymm13,(%rsp)
-	vpaddq	%ymm10,%ymm3,%ymm3
-	vpmuludq	%ymm12,%ymm14,%ymm14
-	vmovdqu	192-16-128(%r13),%ymm10
-	vpaddq	%ymm14,%ymm4,%ymm4
-	vpmuludq	%ymm12,%ymm11,%ymm11
-	vmovdqu	224-16-128(%r13),%ymm14
-	vpaddq	%ymm11,%ymm5,%ymm5
-	vpmuludq	%ymm12,%ymm10,%ymm10
-	vmovdqu	256-16-128(%r13),%ymm11
-	vpaddq	%ymm10,%ymm6,%ymm6
-	vpmuludq	%ymm12,%ymm14,%ymm14
-	shrq	$29,%r12
-	vmovdqu	288-16-128(%r13),%ymm10
-	addq	%r12,%rax
-	vpaddq	%ymm14,%ymm7,%ymm7
-	vpmuludq	%ymm12,%ymm11,%ymm11
-
-	movq	%rax,%r9
-	imull	%ecx,%eax
-	vpaddq	%ymm11,%ymm8,%ymm8
-	vpmuludq	%ymm12,%ymm10,%ymm10
-	andl	$0x1fffffff,%eax
-	vmovd	%eax,%xmm12
-	vmovdqu	96-24-128(%r13),%ymm11
-.byte	0x67
-	vpaddq	%ymm10,%ymm9,%ymm9
-	vpbroadcastq	%xmm12,%ymm12
-
-	vpmuludq	64-24-128(%r13),%ymm0,%ymm14
-	vmovdqu	128-24-128(%r13),%ymm10
-	movq	%rax,%rdx
-	imulq	-128(%r13),%rax
-	movq	8(%rsp),%r10
-	vpaddq	%ymm14,%ymm2,%ymm1
-	vpmuludq	%ymm0,%ymm11,%ymm11
-	vmovdqu	160-24-128(%r13),%ymm14
-	addq	%rax,%r9
-	movq	%rdx,%rax
-	imulq	8-128(%r13),%rax
-.byte	0x67
-	shrq	$29,%r9
-	movq	16(%rsp),%r11
-	vpaddq	%ymm11,%ymm3,%ymm2
-	vpmuludq	%ymm0,%ymm10,%ymm10
-	vmovdqu	192-24-128(%r13),%ymm11
-	addq	%rax,%r10
-	movq	%rdx,%rax
-	imulq	16-128(%r13),%rax
-	vpaddq	%ymm10,%ymm4,%ymm3
-	vpmuludq	%ymm0,%ymm14,%ymm14
-	vmovdqu	224-24-128(%r13),%ymm10
-	imulq	24-128(%r13),%rdx
-	addq	%rax,%r11
-	leaq	(%r9,%r10,1),%rax
-	vpaddq	%ymm14,%ymm5,%ymm4
-	vpmuludq	%ymm0,%ymm11,%ymm11
-	vmovdqu	256-24-128(%r13),%ymm14
-	movq	%rax,%r10
-	imull	%ecx,%eax
-	vpmuludq	%ymm0,%ymm10,%ymm10
-	vpaddq	%ymm11,%ymm6,%ymm5
-	vmovdqu	288-24-128(%r13),%ymm11
-	andl	$0x1fffffff,%eax
-	vpaddq	%ymm10,%ymm7,%ymm6
-	vpmuludq	%ymm0,%ymm14,%ymm14
-	addq	24(%rsp),%rdx
-	vpaddq	%ymm14,%ymm8,%ymm7
-	vpmuludq	%ymm0,%ymm11,%ymm11
-	vpaddq	%ymm11,%ymm9,%ymm8
-	vmovq	%r12,%xmm9
-	movq	%rdx,%r12
-
-	decl	%r14d
-	jnz	L$OOP_REDUCE_1024
-	leaq	448(%rsp),%r12
-	vpaddq	%ymm9,%ymm13,%ymm0
-	vpxor	%ymm9,%ymm9,%ymm9
-
-	vpaddq	288-192(%rbx),%ymm0,%ymm0
-	vpaddq	320-448(%r12),%ymm1,%ymm1
-	vpaddq	352-448(%r12),%ymm2,%ymm2
-	vpaddq	384-448(%r12),%ymm3,%ymm3
-	vpaddq	416-448(%r12),%ymm4,%ymm4
-	vpaddq	448-448(%r12),%ymm5,%ymm5
-	vpaddq	480-448(%r12),%ymm6,%ymm6
-	vpaddq	512-448(%r12),%ymm7,%ymm7
-	vpaddq	544-448(%r12),%ymm8,%ymm8
-
-	vpsrlq	$29,%ymm0,%ymm14
-	vpand	%ymm15,%ymm0,%ymm0
-	vpsrlq	$29,%ymm1,%ymm11
-	vpand	%ymm15,%ymm1,%ymm1
-	vpsrlq	$29,%ymm2,%ymm12
-	vpermq	$0x93,%ymm14,%ymm14
-	vpand	%ymm15,%ymm2,%ymm2
-	vpsrlq	$29,%ymm3,%ymm13
-	vpermq	$0x93,%ymm11,%ymm11
-	vpand	%ymm15,%ymm3,%ymm3
-	vpermq	$0x93,%ymm12,%ymm12
-
-	vpblendd	$3,%ymm9,%ymm14,%ymm10
-	vpermq	$0x93,%ymm13,%ymm13
-	vpblendd	$3,%ymm14,%ymm11,%ymm14
-	vpaddq	%ymm10,%ymm0,%ymm0
-	vpblendd	$3,%ymm11,%ymm12,%ymm11
-	vpaddq	%ymm14,%ymm1,%ymm1
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm11,%ymm2,%ymm2
-	vpblendd	$3,%ymm13,%ymm9,%ymm13
-	vpaddq	%ymm12,%ymm3,%ymm3
-	vpaddq	%ymm13,%ymm4,%ymm4
-
-	vpsrlq	$29,%ymm0,%ymm14
-	vpand	%ymm15,%ymm0,%ymm0
-	vpsrlq	$29,%ymm1,%ymm11
-	vpand	%ymm15,%ymm1,%ymm1
-	vpsrlq	$29,%ymm2,%ymm12
-	vpermq	$0x93,%ymm14,%ymm14
-	vpand	%ymm15,%ymm2,%ymm2
-	vpsrlq	$29,%ymm3,%ymm13
-	vpermq	$0x93,%ymm11,%ymm11
-	vpand	%ymm15,%ymm3,%ymm3
-	vpermq	$0x93,%ymm12,%ymm12
-
-	vpblendd	$3,%ymm9,%ymm14,%ymm10
-	vpermq	$0x93,%ymm13,%ymm13
-	vpblendd	$3,%ymm14,%ymm11,%ymm14
-	vpaddq	%ymm10,%ymm0,%ymm0
-	vpblendd	$3,%ymm11,%ymm12,%ymm11
-	vpaddq	%ymm14,%ymm1,%ymm1
-	vmovdqu	%ymm0,0-128(%rdi)
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm11,%ymm2,%ymm2
-	vmovdqu	%ymm1,32-128(%rdi)
-	vpblendd	$3,%ymm13,%ymm9,%ymm13
-	vpaddq	%ymm12,%ymm3,%ymm3
-	vmovdqu	%ymm2,64-128(%rdi)
-	vpaddq	%ymm13,%ymm4,%ymm4
-	vmovdqu	%ymm3,96-128(%rdi)
-	vpsrlq	$29,%ymm4,%ymm14
-	vpand	%ymm15,%ymm4,%ymm4
-	vpsrlq	$29,%ymm5,%ymm11
-	vpand	%ymm15,%ymm5,%ymm5
-	vpsrlq	$29,%ymm6,%ymm12
-	vpermq	$0x93,%ymm14,%ymm14
-	vpand	%ymm15,%ymm6,%ymm6
-	vpsrlq	$29,%ymm7,%ymm13
-	vpermq	$0x93,%ymm11,%ymm11
-	vpand	%ymm15,%ymm7,%ymm7
-	vpsrlq	$29,%ymm8,%ymm0
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm8,%ymm8
-	vpermq	$0x93,%ymm13,%ymm13
-
-	vpblendd	$3,%ymm9,%ymm14,%ymm10
-	vpermq	$0x93,%ymm0,%ymm0
-	vpblendd	$3,%ymm14,%ymm11,%ymm14
-	vpaddq	%ymm10,%ymm4,%ymm4
-	vpblendd	$3,%ymm11,%ymm12,%ymm11
-	vpaddq	%ymm14,%ymm5,%ymm5
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm11,%ymm6,%ymm6
-	vpblendd	$3,%ymm13,%ymm0,%ymm13
-	vpaddq	%ymm12,%ymm7,%ymm7
-	vpaddq	%ymm13,%ymm8,%ymm8
-
-	vpsrlq	$29,%ymm4,%ymm14
-	vpand	%ymm15,%ymm4,%ymm4
-	vpsrlq	$29,%ymm5,%ymm11
-	vpand	%ymm15,%ymm5,%ymm5
-	vpsrlq	$29,%ymm6,%ymm12
-	vpermq	$0x93,%ymm14,%ymm14
-	vpand	%ymm15,%ymm6,%ymm6
-	vpsrlq	$29,%ymm7,%ymm13
-	vpermq	$0x93,%ymm11,%ymm11
-	vpand	%ymm15,%ymm7,%ymm7
-	vpsrlq	$29,%ymm8,%ymm0
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm8,%ymm8
-	vpermq	$0x93,%ymm13,%ymm13
-
-	vpblendd	$3,%ymm9,%ymm14,%ymm10
-	vpermq	$0x93,%ymm0,%ymm0
-	vpblendd	$3,%ymm14,%ymm11,%ymm14
-	vpaddq	%ymm10,%ymm4,%ymm4
-	vpblendd	$3,%ymm11,%ymm12,%ymm11
-	vpaddq	%ymm14,%ymm5,%ymm5
-	vmovdqu	%ymm4,128-128(%rdi)
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm11,%ymm6,%ymm6
-	vmovdqu	%ymm5,160-128(%rdi)
-	vpblendd	$3,%ymm13,%ymm0,%ymm13
-	vpaddq	%ymm12,%ymm7,%ymm7
-	vmovdqu	%ymm6,192-128(%rdi)
-	vpaddq	%ymm13,%ymm8,%ymm8
-	vmovdqu	%ymm7,224-128(%rdi)
-	vmovdqu	%ymm8,256-128(%rdi)
-
-	movq	%rdi,%rsi
-	decl	%r8d
-	jne	L$OOP_GRANDE_SQR_1024
-
-	vzeroall
-	movq	%rbp,%rax
-
-	movq	-48(%rax),%r15
-
-	movq	-40(%rax),%r14
-
-	movq	-32(%rax),%r13
-
-	movq	-24(%rax),%r12
-
-	movq	-16(%rax),%rbp
-
-	movq	-8(%rax),%rbx
-
-	leaq	(%rax),%rsp
-
-L$sqr_1024_epilogue:
-	ret
-
-
-.globl	_rsaz_1024_mul_avx2
-.private_extern _rsaz_1024_mul_avx2
-
-.p2align	6
-_rsaz_1024_mul_avx2:
-
-_CET_ENDBR
-	leaq	(%rsp),%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	movq	%rax,%rbp
-
-	vzeroall
-	movq	%rdx,%r13
-	subq	$64,%rsp
-
-
-
-
-
-
-.byte	0x67,0x67
-	movq	%rsi,%r15
-	andq	$4095,%r15
-	addq	$320,%r15
-	shrq	$12,%r15
-	movq	%rsi,%r15
-	cmovnzq	%r13,%rsi
-	cmovnzq	%r15,%r13
-
-	movq	%rcx,%r15
-	subq	$-128,%rsi
-	subq	$-128,%rcx
-	subq	$-128,%rdi
-
-	andq	$4095,%r15
-	addq	$320,%r15
-.byte	0x67,0x67
-	shrq	$12,%r15
-	jz	L$mul_1024_no_n_copy
-
-
-
-
-
-	subq	$320,%rsp
-	vmovdqu	0-128(%rcx),%ymm0
-	andq	$-512,%rsp
-	vmovdqu	32-128(%rcx),%ymm1
-	vmovdqu	64-128(%rcx),%ymm2
-	vmovdqu	96-128(%rcx),%ymm3
-	vmovdqu	128-128(%rcx),%ymm4
-	vmovdqu	160-128(%rcx),%ymm5
-	vmovdqu	192-128(%rcx),%ymm6
-	vmovdqu	224-128(%rcx),%ymm7
-	vmovdqu	256-128(%rcx),%ymm8
-	leaq	64+128(%rsp),%rcx
-	vmovdqu	%ymm0,0-128(%rcx)
-	vpxor	%ymm0,%ymm0,%ymm0
-	vmovdqu	%ymm1,32-128(%rcx)
-	vpxor	%ymm1,%ymm1,%ymm1
-	vmovdqu	%ymm2,64-128(%rcx)
-	vpxor	%ymm2,%ymm2,%ymm2
-	vmovdqu	%ymm3,96-128(%rcx)
-	vpxor	%ymm3,%ymm3,%ymm3
-	vmovdqu	%ymm4,128-128(%rcx)
-	vpxor	%ymm4,%ymm4,%ymm4
-	vmovdqu	%ymm5,160-128(%rcx)
-	vpxor	%ymm5,%ymm5,%ymm5
-	vmovdqu	%ymm6,192-128(%rcx)
-	vpxor	%ymm6,%ymm6,%ymm6
-	vmovdqu	%ymm7,224-128(%rcx)
-	vpxor	%ymm7,%ymm7,%ymm7
-	vmovdqu	%ymm8,256-128(%rcx)
-	vmovdqa	%ymm0,%ymm8
-	vmovdqu	%ymm9,288-128(%rcx)
-L$mul_1024_no_n_copy:
-	andq	$-64,%rsp
-
-	movq	(%r13),%rbx
-	vpbroadcastq	(%r13),%ymm10
-	vmovdqu	%ymm0,(%rsp)
-	xorq	%r9,%r9
-.byte	0x67
-	xorq	%r10,%r10
-	xorq	%r11,%r11
-	xorq	%r12,%r12
-
-	vmovdqu	L$and_mask(%rip),%ymm15
-	movl	$9,%r14d
-	vmovdqu	%ymm9,288-128(%rdi)
-	jmp	L$oop_mul_1024
-
-.p2align	5
-L$oop_mul_1024:
-	vpsrlq	$29,%ymm3,%ymm9
-	movq	%rbx,%rax
-	imulq	-128(%rsi),%rax
-	addq	%r9,%rax
-	movq	%rbx,%r10
-	imulq	8-128(%rsi),%r10
-	addq	8(%rsp),%r10
-
-	movq	%rax,%r9
-	imull	%r8d,%eax
-	andl	$0x1fffffff,%eax
-
-	movq	%rbx,%r11
-	imulq	16-128(%rsi),%r11
-	addq	16(%rsp),%r11
-
-	movq	%rbx,%r12
-	imulq	24-128(%rsi),%r12
-	addq	24(%rsp),%r12
-	vpmuludq	32-128(%rsi),%ymm10,%ymm0
-	vmovd	%eax,%xmm11
-	vpaddq	%ymm0,%ymm1,%ymm1
-	vpmuludq	64-128(%rsi),%ymm10,%ymm12
-	vpbroadcastq	%xmm11,%ymm11
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	96-128(%rsi),%ymm10,%ymm13
-	vpand	%ymm15,%ymm3,%ymm3
-	vpaddq	%ymm13,%ymm3,%ymm3
-	vpmuludq	128-128(%rsi),%ymm10,%ymm0
-	vpaddq	%ymm0,%ymm4,%ymm4
-	vpmuludq	160-128(%rsi),%ymm10,%ymm12
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	192-128(%rsi),%ymm10,%ymm13
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpmuludq	224-128(%rsi),%ymm10,%ymm0
-	vpermq	$0x93,%ymm9,%ymm9
-	vpaddq	%ymm0,%ymm7,%ymm7
-	vpmuludq	256-128(%rsi),%ymm10,%ymm12
-	vpbroadcastq	8(%r13),%ymm10
-	vpaddq	%ymm12,%ymm8,%ymm8
-
-	movq	%rax,%rdx
-	imulq	-128(%rcx),%rax
-	addq	%rax,%r9
-	movq	%rdx,%rax
-	imulq	8-128(%rcx),%rax
-	addq	%rax,%r10
-	movq	%rdx,%rax
-	imulq	16-128(%rcx),%rax
-	addq	%rax,%r11
-	shrq	$29,%r9
-	imulq	24-128(%rcx),%rdx
-	addq	%rdx,%r12
-	addq	%r9,%r10
-
-	vpmuludq	32-128(%rcx),%ymm11,%ymm13
-	vmovq	%xmm10,%rbx
-	vpaddq	%ymm13,%ymm1,%ymm1
-	vpmuludq	64-128(%rcx),%ymm11,%ymm0
-	vpaddq	%ymm0,%ymm2,%ymm2
-	vpmuludq	96-128(%rcx),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm3,%ymm3
-	vpmuludq	128-128(%rcx),%ymm11,%ymm13
-	vpaddq	%ymm13,%ymm4,%ymm4
-	vpmuludq	160-128(%rcx),%ymm11,%ymm0
-	vpaddq	%ymm0,%ymm5,%ymm5
-	vpmuludq	192-128(%rcx),%ymm11,%ymm12
-	vpaddq	%ymm12,%ymm6,%ymm6
-	vpmuludq	224-128(%rcx),%ymm11,%ymm13
-	vpblendd	$3,%ymm14,%ymm9,%ymm12
-	vpaddq	%ymm13,%ymm7,%ymm7
-	vpmuludq	256-128(%rcx),%ymm11,%ymm0
-	vpaddq	%ymm12,%ymm3,%ymm3
-	vpaddq	%ymm0,%ymm8,%ymm8
-
-	movq	%rbx,%rax
-	imulq	-128(%rsi),%rax
-	addq	%rax,%r10
-	vmovdqu	-8+32-128(%rsi),%ymm12
-	movq	%rbx,%rax
-	imulq	8-128(%rsi),%rax
-	addq	%rax,%r11
-	vmovdqu	-8+64-128(%rsi),%ymm13
-
-	movq	%r10,%rax
-	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
-	imull	%r8d,%eax
-	vpaddq	%ymm9,%ymm4,%ymm4
-	andl	$0x1fffffff,%eax
-
-	imulq	16-128(%rsi),%rbx
-	addq	%rbx,%r12
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vmovd	%eax,%xmm11
-	vmovdqu	-8+96-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm1,%ymm1
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vpbroadcastq	%xmm11,%ymm11
-	vmovdqu	-8+128-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm2,%ymm2
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-8+160-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm3,%ymm3
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vmovdqu	-8+192-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm4,%ymm4
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vmovdqu	-8+224-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm5,%ymm5
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-8+256-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm6,%ymm6
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vmovdqu	-8+288-128(%rsi),%ymm9
-	vpaddq	%ymm12,%ymm7,%ymm7
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vpaddq	%ymm13,%ymm8,%ymm8
-	vpmuludq	%ymm10,%ymm9,%ymm9
-	vpbroadcastq	16(%r13),%ymm10
-
-	movq	%rax,%rdx
-	imulq	-128(%rcx),%rax
-	addq	%rax,%r10
-	vmovdqu	-8+32-128(%rcx),%ymm0
-	movq	%rdx,%rax
-	imulq	8-128(%rcx),%rax
-	addq	%rax,%r11
-	vmovdqu	-8+64-128(%rcx),%ymm12
-	shrq	$29,%r10
-	imulq	16-128(%rcx),%rdx
-	addq	%rdx,%r12
-	addq	%r10,%r11
-
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovq	%xmm10,%rbx
-	vmovdqu	-8+96-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm1,%ymm1
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-8+128-128(%rcx),%ymm0
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-8+160-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm3,%ymm3
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-8+192-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm4,%ymm4
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-8+224-128(%rcx),%ymm0
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-8+256-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-8+288-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm7,%ymm7
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vpaddq	%ymm13,%ymm9,%ymm9
-
-	vmovdqu	-16+32-128(%rsi),%ymm0
-	movq	%rbx,%rax
-	imulq	-128(%rsi),%rax
-	addq	%r11,%rax
-
-	vmovdqu	-16+64-128(%rsi),%ymm12
-	movq	%rax,%r11
-	imull	%r8d,%eax
-	andl	$0x1fffffff,%eax
-
-	imulq	8-128(%rsi),%rbx
-	addq	%rbx,%r12
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovd	%eax,%xmm11
-	vmovdqu	-16+96-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm1,%ymm1
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vpbroadcastq	%xmm11,%ymm11
-	vmovdqu	-16+128-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vmovdqu	-16+160-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm3,%ymm3
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-16+192-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm4,%ymm4
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vmovdqu	-16+224-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vmovdqu	-16+256-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-16+288-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm7,%ymm7
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vpbroadcastq	24(%r13),%ymm10
-	vpaddq	%ymm13,%ymm9,%ymm9
-
-	vmovdqu	-16+32-128(%rcx),%ymm0
-	movq	%rax,%rdx
-	imulq	-128(%rcx),%rax
-	addq	%rax,%r11
-	vmovdqu	-16+64-128(%rcx),%ymm12
-	imulq	8-128(%rcx),%rdx
-	addq	%rdx,%r12
-	shrq	$29,%r11
-
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovq	%xmm10,%rbx
-	vmovdqu	-16+96-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm1,%ymm1
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-16+128-128(%rcx),%ymm0
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-16+160-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm3,%ymm3
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-16+192-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm4,%ymm4
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-16+224-128(%rcx),%ymm0
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-16+256-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-16+288-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm7,%ymm7
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-24+32-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-24+64-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm9,%ymm9
-
-	addq	%r11,%r12
-	imulq	-128(%rsi),%rbx
-	addq	%rbx,%r12
-
-	movq	%r12,%rax
-	imull	%r8d,%eax
-	andl	$0x1fffffff,%eax
-
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovd	%eax,%xmm11
-	vmovdqu	-24+96-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm1,%ymm1
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vpbroadcastq	%xmm11,%ymm11
-	vmovdqu	-24+128-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm2,%ymm2
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vmovdqu	-24+160-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm3,%ymm3
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-24+192-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm4,%ymm4
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vmovdqu	-24+224-128(%rsi),%ymm0
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vmovdqu	-24+256-128(%rsi),%ymm12
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpmuludq	%ymm10,%ymm0,%ymm0
-	vmovdqu	-24+288-128(%rsi),%ymm13
-	vpaddq	%ymm0,%ymm7,%ymm7
-	vpmuludq	%ymm10,%ymm12,%ymm12
-	vpaddq	%ymm12,%ymm8,%ymm8
-	vpmuludq	%ymm10,%ymm13,%ymm13
-	vpbroadcastq	32(%r13),%ymm10
-	vpaddq	%ymm13,%ymm9,%ymm9
-	addq	$32,%r13
-
-	vmovdqu	-24+32-128(%rcx),%ymm0
-	imulq	-128(%rcx),%rax
-	addq	%rax,%r12
-	shrq	$29,%r12
-
-	vmovdqu	-24+64-128(%rcx),%ymm12
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovq	%xmm10,%rbx
-	vmovdqu	-24+96-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm1,%ymm0
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	%ymm0,(%rsp)
-	vpaddq	%ymm12,%ymm2,%ymm1
-	vmovdqu	-24+128-128(%rcx),%ymm0
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-24+160-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm3,%ymm2
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-24+192-128(%rcx),%ymm13
-	vpaddq	%ymm0,%ymm4,%ymm3
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	vmovdqu	-24+224-128(%rcx),%ymm0
-	vpaddq	%ymm12,%ymm5,%ymm4
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovdqu	-24+256-128(%rcx),%ymm12
-	vpaddq	%ymm13,%ymm6,%ymm5
-	vpmuludq	%ymm11,%ymm0,%ymm0
-	vmovdqu	-24+288-128(%rcx),%ymm13
-	movq	%r12,%r9
-	vpaddq	%ymm0,%ymm7,%ymm6
-	vpmuludq	%ymm11,%ymm12,%ymm12
-	addq	(%rsp),%r9
-	vpaddq	%ymm12,%ymm8,%ymm7
-	vpmuludq	%ymm11,%ymm13,%ymm13
-	vmovq	%r12,%xmm12
-	vpaddq	%ymm13,%ymm9,%ymm8
-
-	decl	%r14d
-	jnz	L$oop_mul_1024
-	vpaddq	(%rsp),%ymm12,%ymm0
-
-	vpsrlq	$29,%ymm0,%ymm12
-	vpand	%ymm15,%ymm0,%ymm0
-	vpsrlq	$29,%ymm1,%ymm13
-	vpand	%ymm15,%ymm1,%ymm1
-	vpsrlq	$29,%ymm2,%ymm10
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm2,%ymm2
-	vpsrlq	$29,%ymm3,%ymm11
-	vpermq	$0x93,%ymm13,%ymm13
-	vpand	%ymm15,%ymm3,%ymm3
-
-	vpblendd	$3,%ymm14,%ymm12,%ymm9
-	vpermq	$0x93,%ymm10,%ymm10
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpermq	$0x93,%ymm11,%ymm11
-	vpaddq	%ymm9,%ymm0,%ymm0
-	vpblendd	$3,%ymm13,%ymm10,%ymm13
-	vpaddq	%ymm12,%ymm1,%ymm1
-	vpblendd	$3,%ymm10,%ymm11,%ymm10
-	vpaddq	%ymm13,%ymm2,%ymm2
-	vpblendd	$3,%ymm11,%ymm14,%ymm11
-	vpaddq	%ymm10,%ymm3,%ymm3
-	vpaddq	%ymm11,%ymm4,%ymm4
-
-	vpsrlq	$29,%ymm0,%ymm12
-	vpand	%ymm15,%ymm0,%ymm0
-	vpsrlq	$29,%ymm1,%ymm13
-	vpand	%ymm15,%ymm1,%ymm1
-	vpsrlq	$29,%ymm2,%ymm10
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm2,%ymm2
-	vpsrlq	$29,%ymm3,%ymm11
-	vpermq	$0x93,%ymm13,%ymm13
-	vpand	%ymm15,%ymm3,%ymm3
-	vpermq	$0x93,%ymm10,%ymm10
-
-	vpblendd	$3,%ymm14,%ymm12,%ymm9
-	vpermq	$0x93,%ymm11,%ymm11
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm9,%ymm0,%ymm0
-	vpblendd	$3,%ymm13,%ymm10,%ymm13
-	vpaddq	%ymm12,%ymm1,%ymm1
-	vpblendd	$3,%ymm10,%ymm11,%ymm10
-	vpaddq	%ymm13,%ymm2,%ymm2
-	vpblendd	$3,%ymm11,%ymm14,%ymm11
-	vpaddq	%ymm10,%ymm3,%ymm3
-	vpaddq	%ymm11,%ymm4,%ymm4
-
-	vmovdqu	%ymm0,0-128(%rdi)
-	vmovdqu	%ymm1,32-128(%rdi)
-	vmovdqu	%ymm2,64-128(%rdi)
-	vmovdqu	%ymm3,96-128(%rdi)
-	vpsrlq	$29,%ymm4,%ymm12
-	vpand	%ymm15,%ymm4,%ymm4
-	vpsrlq	$29,%ymm5,%ymm13
-	vpand	%ymm15,%ymm5,%ymm5
-	vpsrlq	$29,%ymm6,%ymm10
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm6,%ymm6
-	vpsrlq	$29,%ymm7,%ymm11
-	vpermq	$0x93,%ymm13,%ymm13
-	vpand	%ymm15,%ymm7,%ymm7
-	vpsrlq	$29,%ymm8,%ymm0
-	vpermq	$0x93,%ymm10,%ymm10
-	vpand	%ymm15,%ymm8,%ymm8
-	vpermq	$0x93,%ymm11,%ymm11
-
-	vpblendd	$3,%ymm14,%ymm12,%ymm9
-	vpermq	$0x93,%ymm0,%ymm0
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm9,%ymm4,%ymm4
-	vpblendd	$3,%ymm13,%ymm10,%ymm13
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpblendd	$3,%ymm10,%ymm11,%ymm10
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpblendd	$3,%ymm11,%ymm0,%ymm11
-	vpaddq	%ymm10,%ymm7,%ymm7
-	vpaddq	%ymm11,%ymm8,%ymm8
-
-	vpsrlq	$29,%ymm4,%ymm12
-	vpand	%ymm15,%ymm4,%ymm4
-	vpsrlq	$29,%ymm5,%ymm13
-	vpand	%ymm15,%ymm5,%ymm5
-	vpsrlq	$29,%ymm6,%ymm10
-	vpermq	$0x93,%ymm12,%ymm12
-	vpand	%ymm15,%ymm6,%ymm6
-	vpsrlq	$29,%ymm7,%ymm11
-	vpermq	$0x93,%ymm13,%ymm13
-	vpand	%ymm15,%ymm7,%ymm7
-	vpsrlq	$29,%ymm8,%ymm0
-	vpermq	$0x93,%ymm10,%ymm10
-	vpand	%ymm15,%ymm8,%ymm8
-	vpermq	$0x93,%ymm11,%ymm11
-
-	vpblendd	$3,%ymm14,%ymm12,%ymm9
-	vpermq	$0x93,%ymm0,%ymm0
-	vpblendd	$3,%ymm12,%ymm13,%ymm12
-	vpaddq	%ymm9,%ymm4,%ymm4
-	vpblendd	$3,%ymm13,%ymm10,%ymm13
-	vpaddq	%ymm12,%ymm5,%ymm5
-	vpblendd	$3,%ymm10,%ymm11,%ymm10
-	vpaddq	%ymm13,%ymm6,%ymm6
-	vpblendd	$3,%ymm11,%ymm0,%ymm11
-	vpaddq	%ymm10,%ymm7,%ymm7
-	vpaddq	%ymm11,%ymm8,%ymm8
-
-	vmovdqu	%ymm4,128-128(%rdi)
-	vmovdqu	%ymm5,160-128(%rdi)
-	vmovdqu	%ymm6,192-128(%rdi)
-	vmovdqu	%ymm7,224-128(%rdi)
-	vmovdqu	%ymm8,256-128(%rdi)
-	vzeroupper
-
-	movq	%rbp,%rax
-
-	movq	-48(%rax),%r15
-
-	movq	-40(%rax),%r14
-
-	movq	-32(%rax),%r13
-
-	movq	-24(%rax),%r12
-
-	movq	-16(%rax),%rbp
-
-	movq	-8(%rax),%rbx
-
-	leaq	(%rax),%rsp
-
-L$mul_1024_epilogue:
-	ret
-
-
-.globl	_rsaz_1024_red2norm_avx2
-.private_extern _rsaz_1024_red2norm_avx2
-
-.p2align	5
-_rsaz_1024_red2norm_avx2:
-
-_CET_ENDBR
-	subq	$-128,%rsi
-	xorq	%rax,%rax
-	movq	-128(%rsi),%r8
-	movq	-120(%rsi),%r9
-	movq	-112(%rsi),%r10
-	shlq	$0,%r8
-	shlq	$29,%r9
-	movq	%r10,%r11
-	shlq	$58,%r10
-	shrq	$6,%r11
-	addq	%r8,%rax
-	addq	%r9,%rax
-	addq	%r10,%rax
-	adcq	$0,%r11
-	movq	%rax,0(%rdi)
-	movq	%r11,%rax
-	movq	-104(%rsi),%r8
-	movq	-96(%rsi),%r9
-	shlq	$23,%r8
-	movq	%r9,%r10
-	shlq	$52,%r9
-	shrq	$12,%r10
-	addq	%r8,%rax
-	addq	%r9,%rax
-	adcq	$0,%r10
-	movq	%rax,8(%rdi)
-	movq	%r10,%rax
-	movq	-88(%rsi),%r11
-	movq	-80(%rsi),%r8
-	shlq	$17,%r11
-	movq	%r8,%r9
-	shlq	$46,%r8
-	shrq	$18,%r9
-	addq	%r11,%rax
-	addq	%r8,%rax
-	adcq	$0,%r9
-	movq	%rax,16(%rdi)
-	movq	%r9,%rax
-	movq	-72(%rsi),%r10
-	movq	-64(%rsi),%r11
-	shlq	$11,%r10
-	movq	%r11,%r8
-	shlq	$40,%r11
-	shrq	$24,%r8
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,24(%rdi)
-	movq	%r8,%rax
-	movq	-56(%rsi),%r9
-	movq	-48(%rsi),%r10
-	movq	-40(%rsi),%r11
-	shlq	$5,%r9
-	shlq	$34,%r10
-	movq	%r11,%r8
-	shlq	$63,%r11
-	shrq	$1,%r8
-	addq	%r9,%rax
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,32(%rdi)
-	movq	%r8,%rax
-	movq	-32(%rsi),%r9
-	movq	-24(%rsi),%r10
-	shlq	$28,%r9
-	movq	%r10,%r11
-	shlq	$57,%r10
-	shrq	$7,%r11
-	addq	%r9,%rax
-	addq	%r10,%rax
-	adcq	$0,%r11
-	movq	%rax,40(%rdi)
-	movq	%r11,%rax
-	movq	-16(%rsi),%r8
-	movq	-8(%rsi),%r9
-	shlq	$22,%r8
-	movq	%r9,%r10
-	shlq	$51,%r9
-	shrq	$13,%r10
-	addq	%r8,%rax
-	addq	%r9,%rax
-	adcq	$0,%r10
-	movq	%rax,48(%rdi)
-	movq	%r10,%rax
-	movq	0(%rsi),%r11
-	movq	8(%rsi),%r8
-	shlq	$16,%r11
-	movq	%r8,%r9
-	shlq	$45,%r8
-	shrq	$19,%r9
-	addq	%r11,%rax
-	addq	%r8,%rax
-	adcq	$0,%r9
-	movq	%rax,56(%rdi)
-	movq	%r9,%rax
-	movq	16(%rsi),%r10
-	movq	24(%rsi),%r11
-	shlq	$10,%r10
-	movq	%r11,%r8
-	shlq	$39,%r11
-	shrq	$25,%r8
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,64(%rdi)
-	movq	%r8,%rax
-	movq	32(%rsi),%r9
-	movq	40(%rsi),%r10
-	movq	48(%rsi),%r11
-	shlq	$4,%r9
-	shlq	$33,%r10
-	movq	%r11,%r8
-	shlq	$62,%r11
-	shrq	$2,%r8
-	addq	%r9,%rax
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,72(%rdi)
-	movq	%r8,%rax
-	movq	56(%rsi),%r9
-	movq	64(%rsi),%r10
-	shlq	$27,%r9
-	movq	%r10,%r11
-	shlq	$56,%r10
-	shrq	$8,%r11
-	addq	%r9,%rax
-	addq	%r10,%rax
-	adcq	$0,%r11
-	movq	%rax,80(%rdi)
-	movq	%r11,%rax
-	movq	72(%rsi),%r8
-	movq	80(%rsi),%r9
-	shlq	$21,%r8
-	movq	%r9,%r10
-	shlq	$50,%r9
-	shrq	$14,%r10
-	addq	%r8,%rax
-	addq	%r9,%rax
-	adcq	$0,%r10
-	movq	%rax,88(%rdi)
-	movq	%r10,%rax
-	movq	88(%rsi),%r11
-	movq	96(%rsi),%r8
-	shlq	$15,%r11
-	movq	%r8,%r9
-	shlq	$44,%r8
-	shrq	$20,%r9
-	addq	%r11,%rax
-	addq	%r8,%rax
-	adcq	$0,%r9
-	movq	%rax,96(%rdi)
-	movq	%r9,%rax
-	movq	104(%rsi),%r10
-	movq	112(%rsi),%r11
-	shlq	$9,%r10
-	movq	%r11,%r8
-	shlq	$38,%r11
-	shrq	$26,%r8
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,104(%rdi)
-	movq	%r8,%rax
-	movq	120(%rsi),%r9
-	movq	128(%rsi),%r10
-	movq	136(%rsi),%r11
-	shlq	$3,%r9
-	shlq	$32,%r10
-	movq	%r11,%r8
-	shlq	$61,%r11
-	shrq	$3,%r8
-	addq	%r9,%rax
-	addq	%r10,%rax
-	addq	%r11,%rax
-	adcq	$0,%r8
-	movq	%rax,112(%rdi)
-	movq	%r8,%rax
-	movq	144(%rsi),%r9
-	movq	152(%rsi),%r10
-	shlq	$26,%r9
-	movq	%r10,%r11
-	shlq	$55,%r10
-	shrq	$9,%r11
-	addq	%r9,%rax
-	addq	%r10,%rax
-	adcq	$0,%r11
-	movq	%rax,120(%rdi)
-	movq	%r11,%rax
-	ret
-
-
-
-.globl	_rsaz_1024_norm2red_avx2
-.private_extern _rsaz_1024_norm2red_avx2
-
-.p2align	5
-_rsaz_1024_norm2red_avx2:
-
-_CET_ENDBR
-	subq	$-128,%rdi
-	movq	(%rsi),%r8
-	movl	$0x1fffffff,%eax
-	movq	8(%rsi),%r9
-	movq	%r8,%r11
-	shrq	$0,%r11
-	andq	%rax,%r11
-	movq	%r11,-128(%rdi)
-	movq	%r8,%r10
-	shrq	$29,%r10
-	andq	%rax,%r10
-	movq	%r10,-120(%rdi)
-	shrdq	$58,%r9,%r8
-	andq	%rax,%r8
-	movq	%r8,-112(%rdi)
-	movq	16(%rsi),%r10
-	movq	%r9,%r8
-	shrq	$23,%r8
-	andq	%rax,%r8
-	movq	%r8,-104(%rdi)
-	shrdq	$52,%r10,%r9
-	andq	%rax,%r9
-	movq	%r9,-96(%rdi)
-	movq	24(%rsi),%r11
-	movq	%r10,%r9
-	shrq	$17,%r9
-	andq	%rax,%r9
-	movq	%r9,-88(%rdi)
-	shrdq	$46,%r11,%r10
-	andq	%rax,%r10
-	movq	%r10,-80(%rdi)
-	movq	32(%rsi),%r8
-	movq	%r11,%r10
-	shrq	$11,%r10
-	andq	%rax,%r10
-	movq	%r10,-72(%rdi)
-	shrdq	$40,%r8,%r11
-	andq	%rax,%r11
-	movq	%r11,-64(%rdi)
-	movq	40(%rsi),%r9
-	movq	%r8,%r11
-	shrq	$5,%r11
-	andq	%rax,%r11
-	movq	%r11,-56(%rdi)
-	movq	%r8,%r10
-	shrq	$34,%r10
-	andq	%rax,%r10
-	movq	%r10,-48(%rdi)
-	shrdq	$63,%r9,%r8
-	andq	%rax,%r8
-	movq	%r8,-40(%rdi)
-	movq	48(%rsi),%r10
-	movq	%r9,%r8
-	shrq	$28,%r8
-	andq	%rax,%r8
-	movq	%r8,-32(%rdi)
-	shrdq	$57,%r10,%r9
-	andq	%rax,%r9
-	movq	%r9,-24(%rdi)
-	movq	56(%rsi),%r11
-	movq	%r10,%r9
-	shrq	$22,%r9
-	andq	%rax,%r9
-	movq	%r9,-16(%rdi)
-	shrdq	$51,%r11,%r10
-	andq	%rax,%r10
-	movq	%r10,-8(%rdi)
-	movq	64(%rsi),%r8
-	movq	%r11,%r10
-	shrq	$16,%r10
-	andq	%rax,%r10
-	movq	%r10,0(%rdi)
-	shrdq	$45,%r8,%r11
-	andq	%rax,%r11
-	movq	%r11,8(%rdi)
-	movq	72(%rsi),%r9
-	movq	%r8,%r11
-	shrq	$10,%r11
-	andq	%rax,%r11
-	movq	%r11,16(%rdi)
-	shrdq	$39,%r9,%r8
-	andq	%rax,%r8
-	movq	%r8,24(%rdi)
-	movq	80(%rsi),%r10
-	movq	%r9,%r8
-	shrq	$4,%r8
-	andq	%rax,%r8
-	movq	%r8,32(%rdi)
-	movq	%r9,%r11
-	shrq	$33,%r11
-	andq	%rax,%r11
-	movq	%r11,40(%rdi)
-	shrdq	$62,%r10,%r9
-	andq	%rax,%r9
-	movq	%r9,48(%rdi)
-	movq	88(%rsi),%r11
-	movq	%r10,%r9
-	shrq	$27,%r9
-	andq	%rax,%r9
-	movq	%r9,56(%rdi)
-	shrdq	$56,%r11,%r10
-	andq	%rax,%r10
-	movq	%r10,64(%rdi)
-	movq	96(%rsi),%r8
-	movq	%r11,%r10
-	shrq	$21,%r10
-	andq	%rax,%r10
-	movq	%r10,72(%rdi)
-	shrdq	$50,%r8,%r11
-	andq	%rax,%r11
-	movq	%r11,80(%rdi)
-	movq	104(%rsi),%r9
-	movq	%r8,%r11
-	shrq	$15,%r11
-	andq	%rax,%r11
-	movq	%r11,88(%rdi)
-	shrdq	$44,%r9,%r8
-	andq	%rax,%r8
-	movq	%r8,96(%rdi)
-	movq	112(%rsi),%r10
-	movq	%r9,%r8
-	shrq	$9,%r8
-	andq	%rax,%r8
-	movq	%r8,104(%rdi)
-	shrdq	$38,%r10,%r9
-	andq	%rax,%r9
-	movq	%r9,112(%rdi)
-	movq	120(%rsi),%r11
-	movq	%r10,%r9
-	shrq	$3,%r9
-	andq	%rax,%r9
-	movq	%r9,120(%rdi)
-	movq	%r10,%r8
-	shrq	$32,%r8
-	andq	%rax,%r8
-	movq	%r8,128(%rdi)
-	shrdq	$61,%r11,%r10
-	andq	%rax,%r10
-	movq	%r10,136(%rdi)
-	xorq	%r8,%r8
-	movq	%r11,%r10
-	shrq	$26,%r10
-	andq	%rax,%r10
-	movq	%r10,144(%rdi)
-	shrdq	$55,%r8,%r11
-	andq	%rax,%r11
-	movq	%r11,152(%rdi)
-	movq	%r8,160(%rdi)
-	movq	%r8,168(%rdi)
-	movq	%r8,176(%rdi)
-	movq	%r8,184(%rdi)
-	ret
-
-
-.globl	_rsaz_1024_scatter5_avx2
-.private_extern _rsaz_1024_scatter5_avx2
-
-.p2align	5
-_rsaz_1024_scatter5_avx2:
-
-_CET_ENDBR
-	vzeroupper
-	vmovdqu	L$scatter_permd(%rip),%ymm5
-	shll	$4,%edx
-	leaq	(%rdi,%rdx,1),%rdi
-	movl	$9,%eax
-	jmp	L$oop_scatter_1024
-
-.p2align	5
-L$oop_scatter_1024:
-	vmovdqu	(%rsi),%ymm0
-	leaq	32(%rsi),%rsi
-	vpermd	%ymm0,%ymm5,%ymm0
-	vmovdqu	%xmm0,(%rdi)
-	leaq	512(%rdi),%rdi
-	decl	%eax
-	jnz	L$oop_scatter_1024
-
-	vzeroupper
-	ret
-
-
-
-.globl	_rsaz_1024_gather5_avx2
-.private_extern _rsaz_1024_gather5_avx2
-
-.p2align	5
-_rsaz_1024_gather5_avx2:
-
-_CET_ENDBR
-	vzeroupper
-	movq	%rsp,%r11
-
-	leaq	-256(%rsp),%rsp
-	andq	$-32,%rsp
-	leaq	L$inc(%rip),%r10
-	leaq	-128(%rsp),%rax
-
-	vmovd	%edx,%xmm4
-	vmovdqa	(%r10),%ymm0
-	vmovdqa	32(%r10),%ymm1
-	vmovdqa	64(%r10),%ymm5
-	vpbroadcastd	%xmm4,%ymm4
-
-	vpaddd	%ymm5,%ymm0,%ymm2
-	vpcmpeqd	%ymm4,%ymm0,%ymm0
-	vpaddd	%ymm5,%ymm1,%ymm3
-	vpcmpeqd	%ymm4,%ymm1,%ymm1
-	vmovdqa	%ymm0,0+128(%rax)
-	vpaddd	%ymm5,%ymm2,%ymm0
-	vpcmpeqd	%ymm4,%ymm2,%ymm2
-	vmovdqa	%ymm1,32+128(%rax)
-	vpaddd	%ymm5,%ymm3,%ymm1
-	vpcmpeqd	%ymm4,%ymm3,%ymm3
-	vmovdqa	%ymm2,64+128(%rax)
-	vpaddd	%ymm5,%ymm0,%ymm2
-	vpcmpeqd	%ymm4,%ymm0,%ymm0
-	vmovdqa	%ymm3,96+128(%rax)
-	vpaddd	%ymm5,%ymm1,%ymm3
-	vpcmpeqd	%ymm4,%ymm1,%ymm1
-	vmovdqa	%ymm0,128+128(%rax)
-	vpaddd	%ymm5,%ymm2,%ymm8
-	vpcmpeqd	%ymm4,%ymm2,%ymm2
-	vmovdqa	%ymm1,160+128(%rax)
-	vpaddd	%ymm5,%ymm3,%ymm9
-	vpcmpeqd	%ymm4,%ymm3,%ymm3
-	vmovdqa	%ymm2,192+128(%rax)
-	vpaddd	%ymm5,%ymm8,%ymm10
-	vpcmpeqd	%ymm4,%ymm8,%ymm8
-	vmovdqa	%ymm3,224+128(%rax)
-	vpaddd	%ymm5,%ymm9,%ymm11
-	vpcmpeqd	%ymm4,%ymm9,%ymm9
-	vpaddd	%ymm5,%ymm10,%ymm12
-	vpcmpeqd	%ymm4,%ymm10,%ymm10
-	vpaddd	%ymm5,%ymm11,%ymm13
-	vpcmpeqd	%ymm4,%ymm11,%ymm11
-	vpaddd	%ymm5,%ymm12,%ymm14
-	vpcmpeqd	%ymm4,%ymm12,%ymm12
-	vpaddd	%ymm5,%ymm13,%ymm15
-	vpcmpeqd	%ymm4,%ymm13,%ymm13
-	vpcmpeqd	%ymm4,%ymm14,%ymm14
-	vpcmpeqd	%ymm4,%ymm15,%ymm15
-
-	vmovdqa	-32(%r10),%ymm7
-	leaq	128(%rsi),%rsi
-	movl	$9,%edx
-
-L$oop_gather_1024:
-	vmovdqa	0-128(%rsi),%ymm0
-	vmovdqa	32-128(%rsi),%ymm1
-	vmovdqa	64-128(%rsi),%ymm2
-	vmovdqa	96-128(%rsi),%ymm3
-	vpand	0+128(%rax),%ymm0,%ymm0
-	vpand	32+128(%rax),%ymm1,%ymm1
-	vpand	64+128(%rax),%ymm2,%ymm2
-	vpor	%ymm0,%ymm1,%ymm4
-	vpand	96+128(%rax),%ymm3,%ymm3
-	vmovdqa	128-128(%rsi),%ymm0
-	vmovdqa	160-128(%rsi),%ymm1
-	vpor	%ymm2,%ymm3,%ymm5
-	vmovdqa	192-128(%rsi),%ymm2
-	vmovdqa	224-128(%rsi),%ymm3
-	vpand	128+128(%rax),%ymm0,%ymm0
-	vpand	160+128(%rax),%ymm1,%ymm1
-	vpand	192+128(%rax),%ymm2,%ymm2
-	vpor	%ymm0,%ymm4,%ymm4
-	vpand	224+128(%rax),%ymm3,%ymm3
-	vpand	256-128(%rsi),%ymm8,%ymm0
-	vpor	%ymm1,%ymm5,%ymm5
-	vpand	288-128(%rsi),%ymm9,%ymm1
-	vpor	%ymm2,%ymm4,%ymm4
-	vpand	320-128(%rsi),%ymm10,%ymm2
-	vpor	%ymm3,%ymm5,%ymm5
-	vpand	352-128(%rsi),%ymm11,%ymm3
-	vpor	%ymm0,%ymm4,%ymm4
-	vpand	384-128(%rsi),%ymm12,%ymm0
-	vpor	%ymm1,%ymm5,%ymm5
-	vpand	416-128(%rsi),%ymm13,%ymm1
-	vpor	%ymm2,%ymm4,%ymm4
-	vpand	448-128(%rsi),%ymm14,%ymm2
-	vpor	%ymm3,%ymm5,%ymm5
-	vpand	480-128(%rsi),%ymm15,%ymm3
-	leaq	512(%rsi),%rsi
-	vpor	%ymm0,%ymm4,%ymm4
-	vpor	%ymm1,%ymm5,%ymm5
-	vpor	%ymm2,%ymm4,%ymm4
-	vpor	%ymm3,%ymm5,%ymm5
-
-	vpor	%ymm5,%ymm4,%ymm4
-	vextracti128	$1,%ymm4,%xmm5
-	vpor	%xmm4,%xmm5,%xmm5
-	vpermd	%ymm5,%ymm7,%ymm5
-	vmovdqu	%ymm5,(%rdi)
-	leaq	32(%rdi),%rdi
-	decl	%edx
-	jnz	L$oop_gather_1024
-
-	vpxor	%ymm0,%ymm0,%ymm0
-	vmovdqu	%ymm0,(%rdi)
-	vzeroupper
-	leaq	(%r11),%rsp
-
-	ret
-
-L$SEH_end_rsaz_1024_gather5:
-
-.section	__DATA,__const
-.p2align	6
-L$and_mask:
-.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
-L$scatter_permd:
-.long	0,2,4,6,7,7,7,7
-L$gather_permd:
-.long	0,7,1,7,2,7,3,7
-L$inc:
-.long	0,0,0,0, 1,1,1,1
-.long	2,2,2,2, 3,3,3,3
-.long	4,4,4,4, 4,4,4,4
-.p2align	6
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S
deleted file mode 100644
index 6af6744..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha1-x86_64-apple.S
+++ /dev/null
@@ -1,5463 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-.globl	_sha1_block_data_order
-.private_extern _sha1_block_data_order
-
-.p2align	4
-_sha1_block_data_order:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%r10
-	movl	0(%r10),%r9d
-	movl	4(%r10),%r8d
-	movl	8(%r10),%r10d
-	testl	$512,%r8d
-	jz	L$ialu
-	testl	$536870912,%r10d
-	jnz	_shaext_shortcut
-	andl	$296,%r10d
-	cmpl	$296,%r10d
-	je	_avx2_shortcut
-	andl	$268435456,%r8d
-	andl	$1073741824,%r9d
-	orl	%r9d,%r8d
-	cmpl	$1342177280,%r8d
-	je	_avx_shortcut
-	jmp	_ssse3_shortcut
-
-.p2align	4
-L$ialu:
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	movq	%rdi,%r8
-	subq	$72,%rsp
-	movq	%rsi,%r9
-	andq	$-64,%rsp
-	movq	%rdx,%r10
-	movq	%rax,64(%rsp)
-
-L$prologue:
-
-	movl	0(%r8),%esi
-	movl	4(%r8),%edi
-	movl	8(%r8),%r11d
-	movl	12(%r8),%r12d
-	movl	16(%r8),%r13d
-	jmp	L$loop
-
-.p2align	4
-L$loop:
-	movl	0(%r9),%edx
-	bswapl	%edx
-	movl	4(%r9),%ebp
-	movl	%r12d,%eax
-	movl	%edx,0(%rsp)
-	movl	%esi,%ecx
-	bswapl	%ebp
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	andl	%edi,%eax
-	leal	1518500249(%rdx,%r13,1),%r13d
-	addl	%ecx,%r13d
-	xorl	%r12d,%eax
-	roll	$30,%edi
-	addl	%eax,%r13d
-	movl	8(%r9),%r14d
-	movl	%r11d,%eax
-	movl	%ebp,4(%rsp)
-	movl	%r13d,%ecx
-	bswapl	%r14d
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	andl	%esi,%eax
-	leal	1518500249(%rbp,%r12,1),%r12d
-	addl	%ecx,%r12d
-	xorl	%r11d,%eax
-	roll	$30,%esi
-	addl	%eax,%r12d
-	movl	12(%r9),%edx
-	movl	%edi,%eax
-	movl	%r14d,8(%rsp)
-	movl	%r12d,%ecx
-	bswapl	%edx
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	andl	%r13d,%eax
-	leal	1518500249(%r14,%r11,1),%r11d
-	addl	%ecx,%r11d
-	xorl	%edi,%eax
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	movl	16(%r9),%ebp
-	movl	%esi,%eax
-	movl	%edx,12(%rsp)
-	movl	%r11d,%ecx
-	bswapl	%ebp
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	andl	%r12d,%eax
-	leal	1518500249(%rdx,%rdi,1),%edi
-	addl	%ecx,%edi
-	xorl	%esi,%eax
-	roll	$30,%r12d
-	addl	%eax,%edi
-	movl	20(%r9),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,16(%rsp)
-	movl	%edi,%ecx
-	bswapl	%r14d
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	andl	%r11d,%eax
-	leal	1518500249(%rbp,%rsi,1),%esi
-	addl	%ecx,%esi
-	xorl	%r13d,%eax
-	roll	$30,%r11d
-	addl	%eax,%esi
-	movl	24(%r9),%edx
-	movl	%r12d,%eax
-	movl	%r14d,20(%rsp)
-	movl	%esi,%ecx
-	bswapl	%edx
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	andl	%edi,%eax
-	leal	1518500249(%r14,%r13,1),%r13d
-	addl	%ecx,%r13d
-	xorl	%r12d,%eax
-	roll	$30,%edi
-	addl	%eax,%r13d
-	movl	28(%r9),%ebp
-	movl	%r11d,%eax
-	movl	%edx,24(%rsp)
-	movl	%r13d,%ecx
-	bswapl	%ebp
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	andl	%esi,%eax
-	leal	1518500249(%rdx,%r12,1),%r12d
-	addl	%ecx,%r12d
-	xorl	%r11d,%eax
-	roll	$30,%esi
-	addl	%eax,%r12d
-	movl	32(%r9),%r14d
-	movl	%edi,%eax
-	movl	%ebp,28(%rsp)
-	movl	%r12d,%ecx
-	bswapl	%r14d
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	andl	%r13d,%eax
-	leal	1518500249(%rbp,%r11,1),%r11d
-	addl	%ecx,%r11d
-	xorl	%edi,%eax
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	movl	36(%r9),%edx
-	movl	%esi,%eax
-	movl	%r14d,32(%rsp)
-	movl	%r11d,%ecx
-	bswapl	%edx
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	andl	%r12d,%eax
-	leal	1518500249(%r14,%rdi,1),%edi
-	addl	%ecx,%edi
-	xorl	%esi,%eax
-	roll	$30,%r12d
-	addl	%eax,%edi
-	movl	40(%r9),%ebp
-	movl	%r13d,%eax
-	movl	%edx,36(%rsp)
-	movl	%edi,%ecx
-	bswapl	%ebp
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	andl	%r11d,%eax
-	leal	1518500249(%rdx,%rsi,1),%esi
-	addl	%ecx,%esi
-	xorl	%r13d,%eax
-	roll	$30,%r11d
-	addl	%eax,%esi
-	movl	44(%r9),%r14d
-	movl	%r12d,%eax
-	movl	%ebp,40(%rsp)
-	movl	%esi,%ecx
-	bswapl	%r14d
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	andl	%edi,%eax
-	leal	1518500249(%rbp,%r13,1),%r13d
-	addl	%ecx,%r13d
-	xorl	%r12d,%eax
-	roll	$30,%edi
-	addl	%eax,%r13d
-	movl	48(%r9),%edx
-	movl	%r11d,%eax
-	movl	%r14d,44(%rsp)
-	movl	%r13d,%ecx
-	bswapl	%edx
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	andl	%esi,%eax
-	leal	1518500249(%r14,%r12,1),%r12d
-	addl	%ecx,%r12d
-	xorl	%r11d,%eax
-	roll	$30,%esi
-	addl	%eax,%r12d
-	movl	52(%r9),%ebp
-	movl	%edi,%eax
-	movl	%edx,48(%rsp)
-	movl	%r12d,%ecx
-	bswapl	%ebp
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	andl	%r13d,%eax
-	leal	1518500249(%rdx,%r11,1),%r11d
-	addl	%ecx,%r11d
-	xorl	%edi,%eax
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	movl	56(%r9),%r14d
-	movl	%esi,%eax
-	movl	%ebp,52(%rsp)
-	movl	%r11d,%ecx
-	bswapl	%r14d
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	andl	%r12d,%eax
-	leal	1518500249(%rbp,%rdi,1),%edi
-	addl	%ecx,%edi
-	xorl	%esi,%eax
-	roll	$30,%r12d
-	addl	%eax,%edi
-	movl	60(%r9),%edx
-	movl	%r13d,%eax
-	movl	%r14d,56(%rsp)
-	movl	%edi,%ecx
-	bswapl	%edx
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	andl	%r11d,%eax
-	leal	1518500249(%r14,%rsi,1),%esi
-	addl	%ecx,%esi
-	xorl	%r13d,%eax
-	roll	$30,%r11d
-	addl	%eax,%esi
-	xorl	0(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%edx,60(%rsp)
-	movl	%esi,%ecx
-	xorl	8(%rsp),%ebp
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	32(%rsp),%ebp
-	andl	%edi,%eax
-	leal	1518500249(%rdx,%r13,1),%r13d
-	roll	$30,%edi
-	xorl	%r12d,%eax
-	addl	%ecx,%r13d
-	roll	$1,%ebp
-	addl	%eax,%r13d
-	xorl	4(%rsp),%r14d
-	movl	%r11d,%eax
-	movl	%ebp,0(%rsp)
-	movl	%r13d,%ecx
-	xorl	12(%rsp),%r14d
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	36(%rsp),%r14d
-	andl	%esi,%eax
-	leal	1518500249(%rbp,%r12,1),%r12d
-	roll	$30,%esi
-	xorl	%r11d,%eax
-	addl	%ecx,%r12d
-	roll	$1,%r14d
-	addl	%eax,%r12d
-	xorl	8(%rsp),%edx
-	movl	%edi,%eax
-	movl	%r14d,4(%rsp)
-	movl	%r12d,%ecx
-	xorl	16(%rsp),%edx
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	40(%rsp),%edx
-	andl	%r13d,%eax
-	leal	1518500249(%r14,%r11,1),%r11d
-	roll	$30,%r13d
-	xorl	%edi,%eax
-	addl	%ecx,%r11d
-	roll	$1,%edx
-	addl	%eax,%r11d
-	xorl	12(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%edx,8(%rsp)
-	movl	%r11d,%ecx
-	xorl	20(%rsp),%ebp
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	44(%rsp),%ebp
-	andl	%r12d,%eax
-	leal	1518500249(%rdx,%rdi,1),%edi
-	roll	$30,%r12d
-	xorl	%esi,%eax
-	addl	%ecx,%edi
-	roll	$1,%ebp
-	addl	%eax,%edi
-	xorl	16(%rsp),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,12(%rsp)
-	movl	%edi,%ecx
-	xorl	24(%rsp),%r14d
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	48(%rsp),%r14d
-	andl	%r11d,%eax
-	leal	1518500249(%rbp,%rsi,1),%esi
-	roll	$30,%r11d
-	xorl	%r13d,%eax
-	addl	%ecx,%esi
-	roll	$1,%r14d
-	addl	%eax,%esi
-	xorl	20(%rsp),%edx
-	movl	%edi,%eax
-	movl	%r14d,16(%rsp)
-	movl	%esi,%ecx
-	xorl	28(%rsp),%edx
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	52(%rsp),%edx
-	leal	1859775393(%r14,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%edx
-	xorl	24(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%edx,20(%rsp)
-	movl	%r13d,%ecx
-	xorl	32(%rsp),%ebp
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	56(%rsp),%ebp
-	leal	1859775393(%rdx,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%ebp
-	xorl	28(%rsp),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,24(%rsp)
-	movl	%r12d,%ecx
-	xorl	36(%rsp),%r14d
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	60(%rsp),%r14d
-	leal	1859775393(%rbp,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%r14d
-	xorl	32(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r14d,28(%rsp)
-	movl	%r11d,%ecx
-	xorl	40(%rsp),%edx
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	0(%rsp),%edx
-	leal	1859775393(%r14,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%edx
-	xorl	36(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%edx,32(%rsp)
-	movl	%edi,%ecx
-	xorl	44(%rsp),%ebp
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	4(%rsp),%ebp
-	leal	1859775393(%rdx,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%ebp
-	xorl	40(%rsp),%r14d
-	movl	%edi,%eax
-	movl	%ebp,36(%rsp)
-	movl	%esi,%ecx
-	xorl	48(%rsp),%r14d
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	8(%rsp),%r14d
-	leal	1859775393(%rbp,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%r14d
-	xorl	44(%rsp),%edx
-	movl	%esi,%eax
-	movl	%r14d,40(%rsp)
-	movl	%r13d,%ecx
-	xorl	52(%rsp),%edx
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	12(%rsp),%edx
-	leal	1859775393(%r14,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%edx
-	xorl	48(%rsp),%ebp
-	movl	%r13d,%eax
-	movl	%edx,44(%rsp)
-	movl	%r12d,%ecx
-	xorl	56(%rsp),%ebp
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	16(%rsp),%ebp
-	leal	1859775393(%rdx,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%ebp
-	xorl	52(%rsp),%r14d
-	movl	%r12d,%eax
-	movl	%ebp,48(%rsp)
-	movl	%r11d,%ecx
-	xorl	60(%rsp),%r14d
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	20(%rsp),%r14d
-	leal	1859775393(%rbp,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%r14d
-	xorl	56(%rsp),%edx
-	movl	%r11d,%eax
-	movl	%r14d,52(%rsp)
-	movl	%edi,%ecx
-	xorl	0(%rsp),%edx
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	24(%rsp),%edx
-	leal	1859775393(%r14,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%edx
-	xorl	60(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edx,56(%rsp)
-	movl	%esi,%ecx
-	xorl	4(%rsp),%ebp
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	28(%rsp),%ebp
-	leal	1859775393(%rdx,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%ebp
-	xorl	0(%rsp),%r14d
-	movl	%esi,%eax
-	movl	%ebp,60(%rsp)
-	movl	%r13d,%ecx
-	xorl	8(%rsp),%r14d
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	32(%rsp),%r14d
-	leal	1859775393(%rbp,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%r14d
-	xorl	4(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r14d,0(%rsp)
-	movl	%r12d,%ecx
-	xorl	12(%rsp),%edx
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	36(%rsp),%edx
-	leal	1859775393(%r14,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%edx
-	xorl	8(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%edx,4(%rsp)
-	movl	%r11d,%ecx
-	xorl	16(%rsp),%ebp
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	40(%rsp),%ebp
-	leal	1859775393(%rdx,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%ebp
-	xorl	12(%rsp),%r14d
-	movl	%r11d,%eax
-	movl	%ebp,8(%rsp)
-	movl	%edi,%ecx
-	xorl	20(%rsp),%r14d
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	44(%rsp),%r14d
-	leal	1859775393(%rbp,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%r14d
-	xorl	16(%rsp),%edx
-	movl	%edi,%eax
-	movl	%r14d,12(%rsp)
-	movl	%esi,%ecx
-	xorl	24(%rsp),%edx
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	48(%rsp),%edx
-	leal	1859775393(%r14,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%edx
-	xorl	20(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%edx,16(%rsp)
-	movl	%r13d,%ecx
-	xorl	28(%rsp),%ebp
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	52(%rsp),%ebp
-	leal	1859775393(%rdx,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%ebp
-	xorl	24(%rsp),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,20(%rsp)
-	movl	%r12d,%ecx
-	xorl	32(%rsp),%r14d
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	56(%rsp),%r14d
-	leal	1859775393(%rbp,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%r14d
-	xorl	28(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r14d,24(%rsp)
-	movl	%r11d,%ecx
-	xorl	36(%rsp),%edx
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	60(%rsp),%edx
-	leal	1859775393(%r14,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%edx
-	xorl	32(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%edx,28(%rsp)
-	movl	%edi,%ecx
-	xorl	40(%rsp),%ebp
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	0(%rsp),%ebp
-	leal	1859775393(%rdx,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%ebp
-	xorl	36(%rsp),%r14d
-	movl	%r12d,%eax
-	movl	%ebp,32(%rsp)
-	movl	%r12d,%ebx
-	xorl	44(%rsp),%r14d
-	andl	%r11d,%eax
-	movl	%esi,%ecx
-	xorl	4(%rsp),%r14d
-	leal	-1894007588(%rbp,%r13,1),%r13d
-	xorl	%r11d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r13d
-	roll	$1,%r14d
-	andl	%edi,%ebx
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%ebx,%r13d
-	xorl	40(%rsp),%edx
-	movl	%r11d,%eax
-	movl	%r14d,36(%rsp)
-	movl	%r11d,%ebx
-	xorl	48(%rsp),%edx
-	andl	%edi,%eax
-	movl	%r13d,%ecx
-	xorl	8(%rsp),%edx
-	leal	-1894007588(%r14,%r12,1),%r12d
-	xorl	%edi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r12d
-	roll	$1,%edx
-	andl	%esi,%ebx
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%ebx,%r12d
-	xorl	44(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edx,40(%rsp)
-	movl	%edi,%ebx
-	xorl	52(%rsp),%ebp
-	andl	%esi,%eax
-	movl	%r12d,%ecx
-	xorl	12(%rsp),%ebp
-	leal	-1894007588(%rdx,%r11,1),%r11d
-	xorl	%esi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r11d
-	roll	$1,%ebp
-	andl	%r13d,%ebx
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%ebx,%r11d
-	xorl	48(%rsp),%r14d
-	movl	%esi,%eax
-	movl	%ebp,44(%rsp)
-	movl	%esi,%ebx
-	xorl	56(%rsp),%r14d
-	andl	%r13d,%eax
-	movl	%r11d,%ecx
-	xorl	16(%rsp),%r14d
-	leal	-1894007588(%rbp,%rdi,1),%edi
-	xorl	%r13d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%edi
-	roll	$1,%r14d
-	andl	%r12d,%ebx
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%ebx,%edi
-	xorl	52(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r14d,48(%rsp)
-	movl	%r13d,%ebx
-	xorl	60(%rsp),%edx
-	andl	%r12d,%eax
-	movl	%edi,%ecx
-	xorl	20(%rsp),%edx
-	leal	-1894007588(%r14,%rsi,1),%esi
-	xorl	%r12d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%esi
-	roll	$1,%edx
-	andl	%r11d,%ebx
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%ebx,%esi
-	xorl	56(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%edx,52(%rsp)
-	movl	%r12d,%ebx
-	xorl	0(%rsp),%ebp
-	andl	%r11d,%eax
-	movl	%esi,%ecx
-	xorl	24(%rsp),%ebp
-	leal	-1894007588(%rdx,%r13,1),%r13d
-	xorl	%r11d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r13d
-	roll	$1,%ebp
-	andl	%edi,%ebx
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%ebx,%r13d
-	xorl	60(%rsp),%r14d
-	movl	%r11d,%eax
-	movl	%ebp,56(%rsp)
-	movl	%r11d,%ebx
-	xorl	4(%rsp),%r14d
-	andl	%edi,%eax
-	movl	%r13d,%ecx
-	xorl	28(%rsp),%r14d
-	leal	-1894007588(%rbp,%r12,1),%r12d
-	xorl	%edi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r12d
-	roll	$1,%r14d
-	andl	%esi,%ebx
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%ebx,%r12d
-	xorl	0(%rsp),%edx
-	movl	%edi,%eax
-	movl	%r14d,60(%rsp)
-	movl	%edi,%ebx
-	xorl	8(%rsp),%edx
-	andl	%esi,%eax
-	movl	%r12d,%ecx
-	xorl	32(%rsp),%edx
-	leal	-1894007588(%r14,%r11,1),%r11d
-	xorl	%esi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r11d
-	roll	$1,%edx
-	andl	%r13d,%ebx
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%ebx,%r11d
-	xorl	4(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%edx,0(%rsp)
-	movl	%esi,%ebx
-	xorl	12(%rsp),%ebp
-	andl	%r13d,%eax
-	movl	%r11d,%ecx
-	xorl	36(%rsp),%ebp
-	leal	-1894007588(%rdx,%rdi,1),%edi
-	xorl	%r13d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%edi
-	roll	$1,%ebp
-	andl	%r12d,%ebx
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%ebx,%edi
-	xorl	8(%rsp),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,4(%rsp)
-	movl	%r13d,%ebx
-	xorl	16(%rsp),%r14d
-	andl	%r12d,%eax
-	movl	%edi,%ecx
-	xorl	40(%rsp),%r14d
-	leal	-1894007588(%rbp,%rsi,1),%esi
-	xorl	%r12d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%esi
-	roll	$1,%r14d
-	andl	%r11d,%ebx
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%ebx,%esi
-	xorl	12(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r14d,8(%rsp)
-	movl	%r12d,%ebx
-	xorl	20(%rsp),%edx
-	andl	%r11d,%eax
-	movl	%esi,%ecx
-	xorl	44(%rsp),%edx
-	leal	-1894007588(%r14,%r13,1),%r13d
-	xorl	%r11d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r13d
-	roll	$1,%edx
-	andl	%edi,%ebx
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%ebx,%r13d
-	xorl	16(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%edx,12(%rsp)
-	movl	%r11d,%ebx
-	xorl	24(%rsp),%ebp
-	andl	%edi,%eax
-	movl	%r13d,%ecx
-	xorl	48(%rsp),%ebp
-	leal	-1894007588(%rdx,%r12,1),%r12d
-	xorl	%edi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r12d
-	roll	$1,%ebp
-	andl	%esi,%ebx
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%ebx,%r12d
-	xorl	20(%rsp),%r14d
-	movl	%edi,%eax
-	movl	%ebp,16(%rsp)
-	movl	%edi,%ebx
-	xorl	28(%rsp),%r14d
-	andl	%esi,%eax
-	movl	%r12d,%ecx
-	xorl	52(%rsp),%r14d
-	leal	-1894007588(%rbp,%r11,1),%r11d
-	xorl	%esi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r11d
-	roll	$1,%r14d
-	andl	%r13d,%ebx
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%ebx,%r11d
-	xorl	24(%rsp),%edx
-	movl	%esi,%eax
-	movl	%r14d,20(%rsp)
-	movl	%esi,%ebx
-	xorl	32(%rsp),%edx
-	andl	%r13d,%eax
-	movl	%r11d,%ecx
-	xorl	56(%rsp),%edx
-	leal	-1894007588(%r14,%rdi,1),%edi
-	xorl	%r13d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%edi
-	roll	$1,%edx
-	andl	%r12d,%ebx
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%ebx,%edi
-	xorl	28(%rsp),%ebp
-	movl	%r13d,%eax
-	movl	%edx,24(%rsp)
-	movl	%r13d,%ebx
-	xorl	36(%rsp),%ebp
-	andl	%r12d,%eax
-	movl	%edi,%ecx
-	xorl	60(%rsp),%ebp
-	leal	-1894007588(%rdx,%rsi,1),%esi
-	xorl	%r12d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%esi
-	roll	$1,%ebp
-	andl	%r11d,%ebx
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%ebx,%esi
-	xorl	32(%rsp),%r14d
-	movl	%r12d,%eax
-	movl	%ebp,28(%rsp)
-	movl	%r12d,%ebx
-	xorl	40(%rsp),%r14d
-	andl	%r11d,%eax
-	movl	%esi,%ecx
-	xorl	0(%rsp),%r14d
-	leal	-1894007588(%rbp,%r13,1),%r13d
-	xorl	%r11d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r13d
-	roll	$1,%r14d
-	andl	%edi,%ebx
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%ebx,%r13d
-	xorl	36(%rsp),%edx
-	movl	%r11d,%eax
-	movl	%r14d,32(%rsp)
-	movl	%r11d,%ebx
-	xorl	44(%rsp),%edx
-	andl	%edi,%eax
-	movl	%r13d,%ecx
-	xorl	4(%rsp),%edx
-	leal	-1894007588(%r14,%r12,1),%r12d
-	xorl	%edi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r12d
-	roll	$1,%edx
-	andl	%esi,%ebx
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%ebx,%r12d
-	xorl	40(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edx,36(%rsp)
-	movl	%edi,%ebx
-	xorl	48(%rsp),%ebp
-	andl	%esi,%eax
-	movl	%r12d,%ecx
-	xorl	8(%rsp),%ebp
-	leal	-1894007588(%rdx,%r11,1),%r11d
-	xorl	%esi,%ebx
-	roll	$5,%ecx
-	addl	%eax,%r11d
-	roll	$1,%ebp
-	andl	%r13d,%ebx
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%ebx,%r11d
-	xorl	44(%rsp),%r14d
-	movl	%esi,%eax
-	movl	%ebp,40(%rsp)
-	movl	%esi,%ebx
-	xorl	52(%rsp),%r14d
-	andl	%r13d,%eax
-	movl	%r11d,%ecx
-	xorl	12(%rsp),%r14d
-	leal	-1894007588(%rbp,%rdi,1),%edi
-	xorl	%r13d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%edi
-	roll	$1,%r14d
-	andl	%r12d,%ebx
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%ebx,%edi
-	xorl	48(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r14d,44(%rsp)
-	movl	%r13d,%ebx
-	xorl	56(%rsp),%edx
-	andl	%r12d,%eax
-	movl	%edi,%ecx
-	xorl	16(%rsp),%edx
-	leal	-1894007588(%r14,%rsi,1),%esi
-	xorl	%r12d,%ebx
-	roll	$5,%ecx
-	addl	%eax,%esi
-	roll	$1,%edx
-	andl	%r11d,%ebx
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%ebx,%esi
-	xorl	52(%rsp),%ebp
-	movl	%edi,%eax
-	movl	%edx,48(%rsp)
-	movl	%esi,%ecx
-	xorl	60(%rsp),%ebp
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	20(%rsp),%ebp
-	leal	-899497514(%rdx,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%ebp
-	xorl	56(%rsp),%r14d
-	movl	%esi,%eax
-	movl	%ebp,52(%rsp)
-	movl	%r13d,%ecx
-	xorl	0(%rsp),%r14d
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	24(%rsp),%r14d
-	leal	-899497514(%rbp,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%r14d
-	xorl	60(%rsp),%edx
-	movl	%r13d,%eax
-	movl	%r14d,56(%rsp)
-	movl	%r12d,%ecx
-	xorl	4(%rsp),%edx
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	28(%rsp),%edx
-	leal	-899497514(%r14,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%edx
-	xorl	0(%rsp),%ebp
-	movl	%r12d,%eax
-	movl	%edx,60(%rsp)
-	movl	%r11d,%ecx
-	xorl	8(%rsp),%ebp
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	32(%rsp),%ebp
-	leal	-899497514(%rdx,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%ebp
-	xorl	4(%rsp),%r14d
-	movl	%r11d,%eax
-	movl	%ebp,0(%rsp)
-	movl	%edi,%ecx
-	xorl	12(%rsp),%r14d
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	36(%rsp),%r14d
-	leal	-899497514(%rbp,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%r14d
-	xorl	8(%rsp),%edx
-	movl	%edi,%eax
-	movl	%r14d,4(%rsp)
-	movl	%esi,%ecx
-	xorl	16(%rsp),%edx
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	40(%rsp),%edx
-	leal	-899497514(%r14,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%edx
-	xorl	12(%rsp),%ebp
-	movl	%esi,%eax
-	movl	%edx,8(%rsp)
-	movl	%r13d,%ecx
-	xorl	20(%rsp),%ebp
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	44(%rsp),%ebp
-	leal	-899497514(%rdx,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%ebp
-	xorl	16(%rsp),%r14d
-	movl	%r13d,%eax
-	movl	%ebp,12(%rsp)
-	movl	%r12d,%ecx
-	xorl	24(%rsp),%r14d
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	48(%rsp),%r14d
-	leal	-899497514(%rbp,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%r14d
-	xorl	20(%rsp),%edx
-	movl	%r12d,%eax
-	movl	%r14d,16(%rsp)
-	movl	%r11d,%ecx
-	xorl	28(%rsp),%edx
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	52(%rsp),%edx
-	leal	-899497514(%r14,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%edx
-	xorl	24(%rsp),%ebp
-	movl	%r11d,%eax
-	movl	%edx,20(%rsp)
-	movl	%edi,%ecx
-	xorl	32(%rsp),%ebp
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	56(%rsp),%ebp
-	leal	-899497514(%rdx,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%ebp
-	xorl	28(%rsp),%r14d
-	movl	%edi,%eax
-	movl	%ebp,24(%rsp)
-	movl	%esi,%ecx
-	xorl	36(%rsp),%r14d
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	60(%rsp),%r14d
-	leal	-899497514(%rbp,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%r14d
-	xorl	32(%rsp),%edx
-	movl	%esi,%eax
-	movl	%r14d,28(%rsp)
-	movl	%r13d,%ecx
-	xorl	40(%rsp),%edx
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	0(%rsp),%edx
-	leal	-899497514(%r14,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%edx
-	xorl	36(%rsp),%ebp
-	movl	%r13d,%eax
-
-	movl	%r12d,%ecx
-	xorl	44(%rsp),%ebp
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	4(%rsp),%ebp
-	leal	-899497514(%rdx,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%ebp
-	xorl	40(%rsp),%r14d
-	movl	%r12d,%eax
-
-	movl	%r11d,%ecx
-	xorl	48(%rsp),%r14d
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	8(%rsp),%r14d
-	leal	-899497514(%rbp,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%r14d
-	xorl	44(%rsp),%edx
-	movl	%r11d,%eax
-
-	movl	%edi,%ecx
-	xorl	52(%rsp),%edx
-	xorl	%r13d,%eax
-	roll	$5,%ecx
-	xorl	12(%rsp),%edx
-	leal	-899497514(%r14,%rsi,1),%esi
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	roll	$1,%edx
-	xorl	48(%rsp),%ebp
-	movl	%edi,%eax
-
-	movl	%esi,%ecx
-	xorl	56(%rsp),%ebp
-	xorl	%r12d,%eax
-	roll	$5,%ecx
-	xorl	16(%rsp),%ebp
-	leal	-899497514(%rdx,%r13,1),%r13d
-	xorl	%r11d,%eax
-	addl	%ecx,%r13d
-	roll	$30,%edi
-	addl	%eax,%r13d
-	roll	$1,%ebp
-	xorl	52(%rsp),%r14d
-	movl	%esi,%eax
-
-	movl	%r13d,%ecx
-	xorl	60(%rsp),%r14d
-	xorl	%r11d,%eax
-	roll	$5,%ecx
-	xorl	20(%rsp),%r14d
-	leal	-899497514(%rbp,%r12,1),%r12d
-	xorl	%edi,%eax
-	addl	%ecx,%r12d
-	roll	$30,%esi
-	addl	%eax,%r12d
-	roll	$1,%r14d
-	xorl	56(%rsp),%edx
-	movl	%r13d,%eax
-
-	movl	%r12d,%ecx
-	xorl	0(%rsp),%edx
-	xorl	%edi,%eax
-	roll	$5,%ecx
-	xorl	24(%rsp),%edx
-	leal	-899497514(%r14,%r11,1),%r11d
-	xorl	%esi,%eax
-	addl	%ecx,%r11d
-	roll	$30,%r13d
-	addl	%eax,%r11d
-	roll	$1,%edx
-	xorl	60(%rsp),%ebp
-	movl	%r12d,%eax
-
-	movl	%r11d,%ecx
-	xorl	4(%rsp),%ebp
-	xorl	%esi,%eax
-	roll	$5,%ecx
-	xorl	28(%rsp),%ebp
-	leal	-899497514(%rdx,%rdi,1),%edi
-	xorl	%r13d,%eax
-	addl	%ecx,%edi
-	roll	$30,%r12d
-	addl	%eax,%edi
-	roll	$1,%ebp
-	movl	%r11d,%eax
-	movl	%edi,%ecx
-	xorl	%r13d,%eax
-	leal	-899497514(%rbp,%rsi,1),%esi
-	roll	$5,%ecx
-	xorl	%r12d,%eax
-	addl	%ecx,%esi
-	roll	$30,%r11d
-	addl	%eax,%esi
-	addl	0(%r8),%esi
-	addl	4(%r8),%edi
-	addl	8(%r8),%r11d
-	addl	12(%r8),%r12d
-	addl	16(%r8),%r13d
-	movl	%esi,0(%r8)
-	movl	%edi,4(%r8)
-	movl	%r11d,8(%r8)
-	movl	%r12d,12(%r8)
-	movl	%r13d,16(%r8)
-
-	subq	$1,%r10
-	leaq	64(%r9),%r9
-	jnz	L$loop
-
-	movq	64(%rsp),%rsi
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue:
-	ret
-
-
-
-.p2align	5
-sha1_block_data_order_shaext:
-_shaext_shortcut:
-
-	movdqu	(%rdi),%xmm0
-	movd	16(%rdi),%xmm1
-	movdqa	K_XX_XX+160(%rip),%xmm3
-
-	movdqu	(%rsi),%xmm4
-	pshufd	$27,%xmm0,%xmm0
-	movdqu	16(%rsi),%xmm5
-	pshufd	$27,%xmm1,%xmm1
-	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,227
-	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,235
-.byte	102,15,56,0,243
-	movdqa	%xmm1,%xmm9
-.byte	102,15,56,0,251
-	jmp	L$oop_shaext
-
-.p2align	4
-L$oop_shaext:
-	decq	%rdx
-	leaq	64(%rsi),%r8
-	paddd	%xmm4,%xmm1
-	cmovneq	%r8,%rsi
-	prefetcht0	512(%rsi)
-	movdqa	%xmm0,%xmm8
-.byte	15,56,201,229
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
-	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,206
-	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,215
-	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,0
-.byte	15,56,200,204
-	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,0
-.byte	15,56,200,213
-	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
-	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,215
-	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,204
-	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,1
-.byte	15,56,200,213
-	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,1
-.byte	15,56,200,206
-	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
-	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,204
-	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-.byte	15,56,201,229
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,213
-	pxor	%xmm6,%xmm4
-.byte	15,56,201,238
-.byte	15,56,202,231
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,2
-.byte	15,56,200,206
-	pxor	%xmm7,%xmm5
-.byte	15,56,202,236
-.byte	15,56,201,247
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,2
-.byte	15,56,200,215
-	pxor	%xmm4,%xmm6
-.byte	15,56,201,252
-.byte	15,56,202,245
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,204
-	pxor	%xmm5,%xmm7
-.byte	15,56,202,254
-	movdqu	(%rsi),%xmm4
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,213
-	movdqu	16(%rsi),%xmm5
-.byte	102,15,56,0,227
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	15,56,200,206
-	movdqu	32(%rsi),%xmm6
-.byte	102,15,56,0,235
-
-	movdqa	%xmm0,%xmm2
-.byte	15,58,204,193,3
-.byte	15,56,200,215
-	movdqu	48(%rsi),%xmm7
-.byte	102,15,56,0,243
-
-	movdqa	%xmm0,%xmm1
-.byte	15,58,204,194,3
-.byte	65,15,56,200,201
-.byte	102,15,56,0,251
-
-	paddd	%xmm8,%xmm0
-	movdqa	%xmm1,%xmm9
-
-	jnz	L$oop_shaext
-
-	pshufd	$27,%xmm0,%xmm0
-	pshufd	$27,%xmm1,%xmm1
-	movdqu	%xmm0,(%rdi)
-	movd	%xmm1,16(%rdi)
-	ret
-
-
-
-.p2align	4
-sha1_block_data_order_ssse3:
-_ssse3_shortcut:
-
-	movq	%rsp,%r11
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	leaq	-64(%rsp),%rsp
-	andq	$-64,%rsp
-	movq	%rdi,%r8
-	movq	%rsi,%r9
-	movq	%rdx,%r10
-
-	shlq	$6,%r10
-	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r14
-
-	movl	0(%r8),%eax
-	movl	4(%r8),%ebx
-	movl	8(%r8),%ecx
-	movl	12(%r8),%edx
-	movl	%ebx,%esi
-	movl	16(%r8),%ebp
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	andl	%edi,%esi
-
-	movdqa	64(%r14),%xmm6
-	movdqa	-64(%r14),%xmm9
-	movdqu	0(%r9),%xmm0
-	movdqu	16(%r9),%xmm1
-	movdqu	32(%r9),%xmm2
-	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
-.byte	102,15,56,0,206
-.byte	102,15,56,0,214
-	addq	$64,%r9
-	paddd	%xmm9,%xmm0
-.byte	102,15,56,0,222
-	paddd	%xmm9,%xmm1
-	paddd	%xmm9,%xmm2
-	movdqa	%xmm0,0(%rsp)
-	psubd	%xmm9,%xmm0
-	movdqa	%xmm1,16(%rsp)
-	psubd	%xmm9,%xmm1
-	movdqa	%xmm2,32(%rsp)
-	psubd	%xmm9,%xmm2
-	jmp	L$oop_ssse3
-.p2align	4
-L$oop_ssse3:
-	rorl	$2,%ebx
-	pshufd	$238,%xmm0,%xmm4
-	xorl	%edx,%esi
-	movdqa	%xmm3,%xmm8
-	paddd	%xmm3,%xmm9
-	movl	%eax,%edi
-	addl	0(%rsp),%ebp
-	punpcklqdq	%xmm1,%xmm4
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	addl	%esi,%ebp
-	psrldq	$4,%xmm8
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	pxor	%xmm0,%xmm4
-	addl	%eax,%ebp
-	rorl	$7,%eax
-	pxor	%xmm2,%xmm8
-	xorl	%ecx,%edi
-	movl	%ebp,%esi
-	addl	4(%rsp),%edx
-	pxor	%xmm8,%xmm4
-	xorl	%ebx,%eax
-	roll	$5,%ebp
-	movdqa	%xmm9,48(%rsp)
-	addl	%edi,%edx
-	andl	%eax,%esi
-	movdqa	%xmm4,%xmm10
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	rorl	$7,%ebp
-	movdqa	%xmm4,%xmm8
-	xorl	%ebx,%esi
-	pslldq	$12,%xmm10
-	paddd	%xmm4,%xmm4
-	movl	%edx,%edi
-	addl	8(%rsp),%ecx
-	psrld	$31,%xmm8
-	xorl	%eax,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	movdqa	%xmm10,%xmm9
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	psrld	$30,%xmm10
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	por	%xmm8,%xmm4
-	xorl	%eax,%edi
-	movl	%ecx,%esi
-	addl	12(%rsp),%ebx
-	pslld	$2,%xmm9
-	pxor	%xmm10,%xmm4
-	xorl	%ebp,%edx
-	movdqa	-64(%r14),%xmm10
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	andl	%edx,%esi
-	pxor	%xmm9,%xmm4
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	pshufd	$238,%xmm1,%xmm5
-	xorl	%ebp,%esi
-	movdqa	%xmm4,%xmm9
-	paddd	%xmm4,%xmm10
-	movl	%ebx,%edi
-	addl	16(%rsp),%eax
-	punpcklqdq	%xmm2,%xmm5
-	xorl	%edx,%ecx
-	roll	$5,%ebx
-	addl	%esi,%eax
-	psrldq	$4,%xmm9
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	pxor	%xmm1,%xmm5
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	pxor	%xmm3,%xmm9
-	xorl	%edx,%edi
-	movl	%eax,%esi
-	addl	20(%rsp),%ebp
-	pxor	%xmm9,%xmm5
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	movdqa	%xmm10,0(%rsp)
-	addl	%edi,%ebp
-	andl	%ebx,%esi
-	movdqa	%xmm5,%xmm8
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	rorl	$7,%eax
-	movdqa	%xmm5,%xmm9
-	xorl	%ecx,%esi
-	pslldq	$12,%xmm8
-	paddd	%xmm5,%xmm5
-	movl	%ebp,%edi
-	addl	24(%rsp),%edx
-	psrld	$31,%xmm9
-	xorl	%ebx,%eax
-	roll	$5,%ebp
-	addl	%esi,%edx
-	movdqa	%xmm8,%xmm10
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	psrld	$30,%xmm8
-	addl	%ebp,%edx
-	rorl	$7,%ebp
-	por	%xmm9,%xmm5
-	xorl	%ebx,%edi
-	movl	%edx,%esi
-	addl	28(%rsp),%ecx
-	pslld	$2,%xmm10
-	pxor	%xmm8,%xmm5
-	xorl	%eax,%ebp
-	movdqa	-32(%r14),%xmm8
-	roll	$5,%edx
-	addl	%edi,%ecx
-	andl	%ebp,%esi
-	pxor	%xmm10,%xmm5
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	pshufd	$238,%xmm2,%xmm6
-	xorl	%eax,%esi
-	movdqa	%xmm5,%xmm10
-	paddd	%xmm5,%xmm8
-	movl	%ecx,%edi
-	addl	32(%rsp),%ebx
-	punpcklqdq	%xmm3,%xmm6
-	xorl	%ebp,%edx
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	psrldq	$4,%xmm10
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	pxor	%xmm2,%xmm6
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	pxor	%xmm4,%xmm10
-	xorl	%ebp,%edi
-	movl	%ebx,%esi
-	addl	36(%rsp),%eax
-	pxor	%xmm10,%xmm6
-	xorl	%edx,%ecx
-	roll	$5,%ebx
-	movdqa	%xmm8,16(%rsp)
-	addl	%edi,%eax
-	andl	%ecx,%esi
-	movdqa	%xmm6,%xmm9
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	movdqa	%xmm6,%xmm10
-	xorl	%edx,%esi
-	pslldq	$12,%xmm9
-	paddd	%xmm6,%xmm6
-	movl	%eax,%edi
-	addl	40(%rsp),%ebp
-	psrld	$31,%xmm10
-	xorl	%ecx,%ebx
-	roll	$5,%eax
-	addl	%esi,%ebp
-	movdqa	%xmm9,%xmm8
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	psrld	$30,%xmm9
-	addl	%eax,%ebp
-	rorl	$7,%eax
-	por	%xmm10,%xmm6
-	xorl	%ecx,%edi
-	movl	%ebp,%esi
-	addl	44(%rsp),%edx
-	pslld	$2,%xmm8
-	pxor	%xmm9,%xmm6
-	xorl	%ebx,%eax
-	movdqa	-32(%r14),%xmm9
-	roll	$5,%ebp
-	addl	%edi,%edx
-	andl	%eax,%esi
-	pxor	%xmm8,%xmm6
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	rorl	$7,%ebp
-	pshufd	$238,%xmm3,%xmm7
-	xorl	%ebx,%esi
-	movdqa	%xmm6,%xmm8
-	paddd	%xmm6,%xmm9
-	movl	%edx,%edi
-	addl	48(%rsp),%ecx
-	punpcklqdq	%xmm4,%xmm7
-	xorl	%eax,%ebp
-	roll	$5,%edx
-	addl	%esi,%ecx
-	psrldq	$4,%xmm8
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	pxor	%xmm3,%xmm7
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	pxor	%xmm5,%xmm8
-	xorl	%eax,%edi
-	movl	%ecx,%esi
-	addl	52(%rsp),%ebx
-	pxor	%xmm8,%xmm7
-	xorl	%ebp,%edx
-	roll	$5,%ecx
-	movdqa	%xmm9,32(%rsp)
-	addl	%edi,%ebx
-	andl	%edx,%esi
-	movdqa	%xmm7,%xmm10
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	rorl	$7,%ecx
-	movdqa	%xmm7,%xmm8
-	xorl	%ebp,%esi
-	pslldq	$12,%xmm10
-	paddd	%xmm7,%xmm7
-	movl	%ebx,%edi
-	addl	56(%rsp),%eax
-	psrld	$31,%xmm8
-	xorl	%edx,%ecx
-	roll	$5,%ebx
-	addl	%esi,%eax
-	movdqa	%xmm10,%xmm9
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	psrld	$30,%xmm10
-	addl	%ebx,%eax
-	rorl	$7,%ebx
-	por	%xmm8,%xmm7
-	xorl	%edx,%edi
-	movl	%eax,%esi
-	addl	60(%rsp),%ebp
-	pslld	$2,%xmm9
-	pxor	%xmm10,%xmm7
-	xorl	%ecx,%ebx
-	movdqa	-32(%r14),%xmm10
-	roll	$5,%eax
-	addl	%edi,%ebp
-	andl	%ebx,%esi
-	pxor	%xmm9,%xmm7
-	pshufd	$238,%xmm6,%xmm9
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	rorl	$7,%eax
-	pxor	%xmm4,%xmm0
-	xorl	%ecx,%esi
-	movl	%ebp,%edi
-	addl	0(%rsp),%edx
-	punpcklqdq	%xmm7,%xmm9
-	xorl	%ebx,%eax
-	roll	$5,%ebp
-	pxor	%xmm1,%xmm0
-	addl	%esi,%edx
-	andl	%eax,%edi
-	movdqa	%xmm10,%xmm8
-	xorl	%ebx,%eax
-	paddd	%xmm7,%xmm10
-	addl	%ebp,%edx
-	pxor	%xmm9,%xmm0
-	rorl	$7,%ebp
-	xorl	%ebx,%edi
-	movl	%edx,%esi
-	addl	4(%rsp),%ecx
-	movdqa	%xmm0,%xmm9
-	xorl	%eax,%ebp
-	roll	$5,%edx
-	movdqa	%xmm10,48(%rsp)
-	addl	%edi,%ecx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	pslld	$2,%xmm0
-	addl	%edx,%ecx
-	rorl	$7,%edx
-	psrld	$30,%xmm9
-	xorl	%eax,%esi
-	movl	%ecx,%edi
-	addl	8(%rsp),%ebx
-	por	%xmm9,%xmm0
-	xorl	%ebp,%edx
-	roll	$5,%ecx
-	pshufd	$238,%xmm7,%xmm10
-	addl	%esi,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	12(%rsp),%eax
-	xorl	%ebp,%edi
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	pxor	%xmm5,%xmm1
-	addl	16(%rsp),%ebp
-	xorl	%ecx,%esi
-	punpcklqdq	%xmm0,%xmm10
-	movl	%eax,%edi
-	roll	$5,%eax
-	pxor	%xmm2,%xmm1
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	movdqa	%xmm8,%xmm9
-	rorl	$7,%ebx
-	paddd	%xmm0,%xmm8
-	addl	%eax,%ebp
-	pxor	%xmm10,%xmm1
-	addl	20(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	movdqa	%xmm1,%xmm10
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	movdqa	%xmm8,0(%rsp)
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	24(%rsp),%ecx
-	pslld	$2,%xmm1
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	psrld	$30,%xmm10
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	rorl	$7,%ebp
-	por	%xmm10,%xmm1
-	addl	%edx,%ecx
-	addl	28(%rsp),%ebx
-	pshufd	$238,%xmm0,%xmm8
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	pxor	%xmm6,%xmm2
-	addl	32(%rsp),%eax
-	xorl	%edx,%esi
-	punpcklqdq	%xmm1,%xmm8
-	movl	%ebx,%edi
-	roll	$5,%ebx
-	pxor	%xmm3,%xmm2
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	movdqa	0(%r14),%xmm10
-	rorl	$7,%ecx
-	paddd	%xmm1,%xmm9
-	addl	%ebx,%eax
-	pxor	%xmm8,%xmm2
-	addl	36(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	roll	$5,%eax
-	movdqa	%xmm2,%xmm8
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	movdqa	%xmm9,16(%rsp)
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	40(%rsp),%edx
-	pslld	$2,%xmm2
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	psrld	$30,%xmm8
-	roll	$5,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	rorl	$7,%eax
-	por	%xmm8,%xmm2
-	addl	%ebp,%edx
-	addl	44(%rsp),%ecx
-	pshufd	$238,%xmm1,%xmm9
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%ebp
-	addl	%edx,%ecx
-	pxor	%xmm7,%xmm3
-	addl	48(%rsp),%ebx
-	xorl	%ebp,%esi
-	punpcklqdq	%xmm2,%xmm9
-	movl	%ecx,%edi
-	roll	$5,%ecx
-	pxor	%xmm4,%xmm3
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	movdqa	%xmm10,%xmm8
-	rorl	$7,%edx
-	paddd	%xmm2,%xmm10
-	addl	%ecx,%ebx
-	pxor	%xmm9,%xmm3
-	addl	52(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	movdqa	%xmm3,%xmm9
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	movdqa	%xmm10,32(%rsp)
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	56(%rsp),%ebp
-	pslld	$2,%xmm3
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	psrld	$30,%xmm9
-	roll	$5,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	rorl	$7,%ebx
-	por	%xmm9,%xmm3
-	addl	%eax,%ebp
-	addl	60(%rsp),%edx
-	pshufd	$238,%xmm2,%xmm10
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	pxor	%xmm0,%xmm4
-	addl	0(%rsp),%ecx
-	xorl	%eax,%esi
-	punpcklqdq	%xmm3,%xmm10
-	movl	%edx,%edi
-	roll	$5,%edx
-	pxor	%xmm5,%xmm4
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	movdqa	%xmm8,%xmm9
-	rorl	$7,%ebp
-	paddd	%xmm3,%xmm8
-	addl	%edx,%ecx
-	pxor	%xmm10,%xmm4
-	addl	4(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	movdqa	%xmm4,%xmm10
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	movdqa	%xmm8,48(%rsp)
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	8(%rsp),%eax
-	pslld	$2,%xmm4
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	psrld	$30,%xmm10
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	rorl	$7,%ecx
-	por	%xmm10,%xmm4
-	addl	%ebx,%eax
-	addl	12(%rsp),%ebp
-	pshufd	$238,%xmm3,%xmm8
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	pxor	%xmm1,%xmm5
-	addl	16(%rsp),%edx
-	xorl	%ebx,%esi
-	punpcklqdq	%xmm4,%xmm8
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	pxor	%xmm6,%xmm5
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	movdqa	%xmm9,%xmm10
-	rorl	$7,%eax
-	paddd	%xmm4,%xmm9
-	addl	%ebp,%edx
-	pxor	%xmm8,%xmm5
-	addl	20(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	roll	$5,%edx
-	movdqa	%xmm5,%xmm8
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	movdqa	%xmm9,0(%rsp)
-	rorl	$7,%ebp
-	addl	%edx,%ecx
-	addl	24(%rsp),%ebx
-	pslld	$2,%xmm5
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	psrld	$30,%xmm8
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	rorl	$7,%edx
-	por	%xmm8,%xmm5
-	addl	%ecx,%ebx
-	addl	28(%rsp),%eax
-	pshufd	$238,%xmm4,%xmm9
-	rorl	$7,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%edi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	pxor	%xmm2,%xmm6
-	addl	32(%rsp),%ebp
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	punpcklqdq	%xmm5,%xmm9
-	movl	%eax,%edi
-	xorl	%ecx,%esi
-	pxor	%xmm7,%xmm6
-	roll	$5,%eax
-	addl	%esi,%ebp
-	movdqa	%xmm10,%xmm8
-	xorl	%ebx,%edi
-	paddd	%xmm5,%xmm10
-	xorl	%ecx,%ebx
-	pxor	%xmm9,%xmm6
-	addl	%eax,%ebp
-	addl	36(%rsp),%edx
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	movdqa	%xmm6,%xmm9
-	movl	%ebp,%esi
-	xorl	%ebx,%edi
-	movdqa	%xmm10,16(%rsp)
-	roll	$5,%ebp
-	addl	%edi,%edx
-	xorl	%eax,%esi
-	pslld	$2,%xmm6
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	psrld	$30,%xmm9
-	addl	40(%rsp),%ecx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	por	%xmm9,%xmm6
-	rorl	$7,%ebp
-	movl	%edx,%edi
-	xorl	%eax,%esi
-	roll	$5,%edx
-	pshufd	$238,%xmm5,%xmm10
-	addl	%esi,%ecx
-	xorl	%ebp,%edi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	addl	44(%rsp),%ebx
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	rorl	$7,%edx
-	movl	%ecx,%esi
-	xorl	%ebp,%edi
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%edx,%esi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	pxor	%xmm3,%xmm7
-	addl	48(%rsp),%eax
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	rorl	$7,%ecx
-	punpcklqdq	%xmm6,%xmm10
-	movl	%ebx,%edi
-	xorl	%edx,%esi
-	pxor	%xmm0,%xmm7
-	roll	$5,%ebx
-	addl	%esi,%eax
-	movdqa	32(%r14),%xmm9
-	xorl	%ecx,%edi
-	paddd	%xmm6,%xmm8
-	xorl	%edx,%ecx
-	pxor	%xmm10,%xmm7
-	addl	%ebx,%eax
-	addl	52(%rsp),%ebp
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	movdqa	%xmm7,%xmm10
-	movl	%eax,%esi
-	xorl	%ecx,%edi
-	movdqa	%xmm8,32(%rsp)
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ebx,%esi
-	pslld	$2,%xmm7
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	psrld	$30,%xmm10
-	addl	56(%rsp),%edx
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	por	%xmm10,%xmm7
-	rorl	$7,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%esi
-	roll	$5,%ebp
-	pshufd	$238,%xmm6,%xmm8
-	addl	%esi,%edx
-	xorl	%eax,%edi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	addl	60(%rsp),%ecx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	rorl	$7,%ebp
-	movl	%edx,%esi
-	xorl	%eax,%edi
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%ebp,%esi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	pxor	%xmm4,%xmm0
-	addl	0(%rsp),%ebx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	rorl	$7,%edx
-	punpcklqdq	%xmm7,%xmm8
-	movl	%ecx,%edi
-	xorl	%ebp,%esi
-	pxor	%xmm1,%xmm0
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	movdqa	%xmm9,%xmm10
-	xorl	%edx,%edi
-	paddd	%xmm7,%xmm9
-	xorl	%ebp,%edx
-	pxor	%xmm8,%xmm0
-	addl	%ecx,%ebx
-	addl	4(%rsp),%eax
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	rorl	$7,%ecx
-	movdqa	%xmm0,%xmm8
-	movl	%ebx,%esi
-	xorl	%edx,%edi
-	movdqa	%xmm9,48(%rsp)
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%ecx,%esi
-	pslld	$2,%xmm0
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	psrld	$30,%xmm8
-	addl	8(%rsp),%ebp
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	por	%xmm8,%xmm0
-	rorl	$7,%ebx
-	movl	%eax,%edi
-	xorl	%ecx,%esi
-	roll	$5,%eax
-	pshufd	$238,%xmm7,%xmm9
-	addl	%esi,%ebp
-	xorl	%ebx,%edi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	addl	12(%rsp),%edx
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	movl	%ebp,%esi
-	xorl	%ebx,%edi
-	roll	$5,%ebp
-	addl	%edi,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	pxor	%xmm5,%xmm1
-	addl	16(%rsp),%ecx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	rorl	$7,%ebp
-	punpcklqdq	%xmm0,%xmm9
-	movl	%edx,%edi
-	xorl	%eax,%esi
-	pxor	%xmm2,%xmm1
-	roll	$5,%edx
-	addl	%esi,%ecx
-	movdqa	%xmm10,%xmm8
-	xorl	%ebp,%edi
-	paddd	%xmm0,%xmm10
-	xorl	%eax,%ebp
-	pxor	%xmm9,%xmm1
-	addl	%edx,%ecx
-	addl	20(%rsp),%ebx
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	rorl	$7,%edx
-	movdqa	%xmm1,%xmm9
-	movl	%ecx,%esi
-	xorl	%ebp,%edi
-	movdqa	%xmm10,0(%rsp)
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%edx,%esi
-	pslld	$2,%xmm1
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	psrld	$30,%xmm9
-	addl	24(%rsp),%eax
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	por	%xmm9,%xmm1
-	rorl	$7,%ecx
-	movl	%ebx,%edi
-	xorl	%edx,%esi
-	roll	$5,%ebx
-	pshufd	$238,%xmm0,%xmm10
-	addl	%esi,%eax
-	xorl	%ecx,%edi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	28(%rsp),%ebp
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	rorl	$7,%ebx
-	movl	%eax,%esi
-	xorl	%ecx,%edi
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	pxor	%xmm6,%xmm2
-	addl	32(%rsp),%edx
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	rorl	$7,%eax
-	punpcklqdq	%xmm1,%xmm10
-	movl	%ebp,%edi
-	xorl	%ebx,%esi
-	pxor	%xmm3,%xmm2
-	roll	$5,%ebp
-	addl	%esi,%edx
-	movdqa	%xmm8,%xmm9
-	xorl	%eax,%edi
-	paddd	%xmm1,%xmm8
-	xorl	%ebx,%eax
-	pxor	%xmm10,%xmm2
-	addl	%ebp,%edx
-	addl	36(%rsp),%ecx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	rorl	$7,%ebp
-	movdqa	%xmm2,%xmm10
-	movl	%edx,%esi
-	xorl	%eax,%edi
-	movdqa	%xmm8,16(%rsp)
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%ebp,%esi
-	pslld	$2,%xmm2
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	psrld	$30,%xmm10
-	addl	40(%rsp),%ebx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	por	%xmm10,%xmm2
-	rorl	$7,%edx
-	movl	%ecx,%edi
-	xorl	%ebp,%esi
-	roll	$5,%ecx
-	pshufd	$238,%xmm1,%xmm8
-	addl	%esi,%ebx
-	xorl	%edx,%edi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	44(%rsp),%eax
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	rorl	$7,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%edi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	addl	%ebx,%eax
-	pxor	%xmm7,%xmm3
-	addl	48(%rsp),%ebp
-	xorl	%ecx,%esi
-	punpcklqdq	%xmm2,%xmm8
-	movl	%eax,%edi
-	roll	$5,%eax
-	pxor	%xmm4,%xmm3
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	movdqa	%xmm9,%xmm10
-	rorl	$7,%ebx
-	paddd	%xmm2,%xmm9
-	addl	%eax,%ebp
-	pxor	%xmm8,%xmm3
-	addl	52(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	movdqa	%xmm3,%xmm8
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	movdqa	%xmm9,32(%rsp)
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	56(%rsp),%ecx
-	pslld	$2,%xmm3
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	psrld	$30,%xmm8
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	rorl	$7,%ebp
-	por	%xmm8,%xmm3
-	addl	%edx,%ecx
-	addl	60(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	0(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	roll	$5,%ebx
-	paddd	%xmm3,%xmm10
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	movdqa	%xmm10,48(%rsp)
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	4(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	8(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	12(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%ebp
-	addl	%edx,%ecx
-	cmpq	%r10,%r9
-	je	L$done_ssse3
-	movdqa	64(%r14),%xmm6
-	movdqa	-64(%r14),%xmm9
-	movdqu	0(%r9),%xmm0
-	movdqu	16(%r9),%xmm1
-	movdqu	32(%r9),%xmm2
-	movdqu	48(%r9),%xmm3
-.byte	102,15,56,0,198
-	addq	$64,%r9
-	addl	16(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-.byte	102,15,56,0,206
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	rorl	$7,%edx
-	paddd	%xmm9,%xmm0
-	addl	%ecx,%ebx
-	addl	20(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	movdqa	%xmm0,0(%rsp)
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	psubd	%xmm9,%xmm0
-	addl	%ebx,%eax
-	addl	24(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	roll	$5,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	28(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	32(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-.byte	102,15,56,0,214
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	rorl	$7,%ebp
-	paddd	%xmm9,%xmm1
-	addl	%edx,%ecx
-	addl	36(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	movdqa	%xmm1,16(%rsp)
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	rorl	$7,%edx
-	psubd	%xmm9,%xmm1
-	addl	%ecx,%ebx
-	addl	40(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	44(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	48(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-.byte	102,15,56,0,222
-	roll	$5,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	rorl	$7,%eax
-	paddd	%xmm9,%xmm2
-	addl	%ebp,%edx
-	addl	52(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	movdqa	%xmm2,32(%rsp)
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%ebp
-	psubd	%xmm9,%xmm2
-	addl	%edx,%ecx
-	addl	56(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	60(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	0(%r8),%eax
-	addl	4(%r8),%esi
-	addl	8(%r8),%ecx
-	addl	12(%r8),%edx
-	movl	%eax,0(%r8)
-	addl	16(%r8),%ebp
-	movl	%esi,4(%r8)
-	movl	%esi,%ebx
-	movl	%ecx,8(%r8)
-	movl	%ecx,%edi
-	movl	%edx,12(%r8)
-	xorl	%edx,%edi
-	movl	%ebp,16(%r8)
-	andl	%edi,%esi
-	jmp	L$oop_ssse3
-
-.p2align	4
-L$done_ssse3:
-	addl	16(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	20(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	24(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	roll	$5,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	28(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	roll	$5,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	32(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	roll	$5,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	rorl	$7,%ebp
-	addl	%edx,%ecx
-	addl	36(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	roll	$5,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	40(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	roll	$5,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	44(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	roll	$5,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	rorl	$7,%ebx
-	addl	%eax,%ebp
-	addl	48(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	roll	$5,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	rorl	$7,%eax
-	addl	%ebp,%edx
-	addl	52(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	roll	$5,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	rorl	$7,%ebp
-	addl	%edx,%ecx
-	addl	56(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	roll	$5,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	rorl	$7,%edx
-	addl	%ecx,%ebx
-	addl	60(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	roll	$5,%ebx
-	addl	%edi,%eax
-	rorl	$7,%ecx
-	addl	%ebx,%eax
-	addl	0(%r8),%eax
-	addl	4(%r8),%esi
-	addl	8(%r8),%ecx
-	movl	%eax,0(%r8)
-	addl	12(%r8),%edx
-	movl	%esi,4(%r8)
-	addl	16(%r8),%ebp
-	movl	%ecx,8(%r8)
-	movl	%edx,12(%r8)
-	movl	%ebp,16(%r8)
-	movq	-40(%r11),%r14
-
-	movq	-32(%r11),%r13
-
-	movq	-24(%r11),%r12
-
-	movq	-16(%r11),%rbp
-
-	movq	-8(%r11),%rbx
-
-	leaq	(%r11),%rsp
-
-L$epilogue_ssse3:
-	ret
-
-
-
-.p2align	4
-sha1_block_data_order_avx:
-_avx_shortcut:
-
-	movq	%rsp,%r11
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	leaq	-64(%rsp),%rsp
-	vzeroupper
-	andq	$-64,%rsp
-	movq	%rdi,%r8
-	movq	%rsi,%r9
-	movq	%rdx,%r10
-
-	shlq	$6,%r10
-	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r14
-
-	movl	0(%r8),%eax
-	movl	4(%r8),%ebx
-	movl	8(%r8),%ecx
-	movl	12(%r8),%edx
-	movl	%ebx,%esi
-	movl	16(%r8),%ebp
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	andl	%edi,%esi
-
-	vmovdqa	64(%r14),%xmm6
-	vmovdqa	-64(%r14),%xmm11
-	vmovdqu	0(%r9),%xmm0
-	vmovdqu	16(%r9),%xmm1
-	vmovdqu	32(%r9),%xmm2
-	vmovdqu	48(%r9),%xmm3
-	vpshufb	%xmm6,%xmm0,%xmm0
-	addq	$64,%r9
-	vpshufb	%xmm6,%xmm1,%xmm1
-	vpshufb	%xmm6,%xmm2,%xmm2
-	vpshufb	%xmm6,%xmm3,%xmm3
-	vpaddd	%xmm11,%xmm0,%xmm4
-	vpaddd	%xmm11,%xmm1,%xmm5
-	vpaddd	%xmm11,%xmm2,%xmm6
-	vmovdqa	%xmm4,0(%rsp)
-	vmovdqa	%xmm5,16(%rsp)
-	vmovdqa	%xmm6,32(%rsp)
-	jmp	L$oop_avx
-.p2align	4
-L$oop_avx:
-	shrdl	$2,%ebx,%ebx
-	xorl	%edx,%esi
-	vpalignr	$8,%xmm0,%xmm1,%xmm4
-	movl	%eax,%edi
-	addl	0(%rsp),%ebp
-	vpaddd	%xmm3,%xmm11,%xmm9
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vpsrldq	$4,%xmm3,%xmm8
-	addl	%esi,%ebp
-	andl	%ebx,%edi
-	vpxor	%xmm0,%xmm4,%xmm4
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	vpxor	%xmm2,%xmm8,%xmm8
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%edi
-	movl	%ebp,%esi
-	addl	4(%rsp),%edx
-	vpxor	%xmm8,%xmm4,%xmm4
-	xorl	%ebx,%eax
-	shldl	$5,%ebp,%ebp
-	vmovdqa	%xmm9,48(%rsp)
-	addl	%edi,%edx
-	andl	%eax,%esi
-	vpsrld	$31,%xmm4,%xmm8
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	shrdl	$7,%ebp,%ebp
-	xorl	%ebx,%esi
-	vpslldq	$12,%xmm4,%xmm10
-	vpaddd	%xmm4,%xmm4,%xmm4
-	movl	%edx,%edi
-	addl	8(%rsp),%ecx
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	vpsrld	$30,%xmm10,%xmm9
-	vpor	%xmm8,%xmm4,%xmm4
-	addl	%esi,%ecx
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm4,%xmm4
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
-	addl	12(%rsp),%ebx
-	vpxor	%xmm10,%xmm4,%xmm4
-	xorl	%ebp,%edx
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	shrdl	$7,%ecx,%ecx
-	xorl	%ebp,%esi
-	vpalignr	$8,%xmm1,%xmm2,%xmm5
-	movl	%ebx,%edi
-	addl	16(%rsp),%eax
-	vpaddd	%xmm4,%xmm11,%xmm9
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	vpsrldq	$4,%xmm4,%xmm8
-	addl	%esi,%eax
-	andl	%ecx,%edi
-	vpxor	%xmm1,%xmm5,%xmm5
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpxor	%xmm3,%xmm8,%xmm8
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%edi
-	movl	%eax,%esi
-	addl	20(%rsp),%ebp
-	vpxor	%xmm8,%xmm5,%xmm5
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vmovdqa	%xmm9,0(%rsp)
-	addl	%edi,%ebp
-	andl	%ebx,%esi
-	vpsrld	$31,%xmm5,%xmm8
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%esi
-	vpslldq	$12,%xmm5,%xmm10
-	vpaddd	%xmm5,%xmm5,%xmm5
-	movl	%ebp,%edi
-	addl	24(%rsp),%edx
-	xorl	%ebx,%eax
-	shldl	$5,%ebp,%ebp
-	vpsrld	$30,%xmm10,%xmm9
-	vpor	%xmm8,%xmm5,%xmm5
-	addl	%esi,%edx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm5,%xmm5
-	shrdl	$7,%ebp,%ebp
-	xorl	%ebx,%edi
-	movl	%edx,%esi
-	addl	28(%rsp),%ecx
-	vpxor	%xmm10,%xmm5,%xmm5
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	vmovdqa	-32(%r14),%xmm11
-	addl	%edi,%ecx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%esi
-	vpalignr	$8,%xmm2,%xmm3,%xmm6
-	movl	%ecx,%edi
-	addl	32(%rsp),%ebx
-	vpaddd	%xmm5,%xmm11,%xmm9
-	xorl	%ebp,%edx
-	shldl	$5,%ecx,%ecx
-	vpsrldq	$4,%xmm5,%xmm8
-	addl	%esi,%ebx
-	andl	%edx,%edi
-	vpxor	%xmm2,%xmm6,%xmm6
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	vpxor	%xmm4,%xmm8,%xmm8
-	shrdl	$7,%ecx,%ecx
-	xorl	%ebp,%edi
-	movl	%ebx,%esi
-	addl	36(%rsp),%eax
-	vpxor	%xmm8,%xmm6,%xmm6
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	vmovdqa	%xmm9,16(%rsp)
-	addl	%edi,%eax
-	andl	%ecx,%esi
-	vpsrld	$31,%xmm6,%xmm8
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%esi
-	vpslldq	$12,%xmm6,%xmm10
-	vpaddd	%xmm6,%xmm6,%xmm6
-	movl	%eax,%edi
-	addl	40(%rsp),%ebp
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	vpsrld	$30,%xmm10,%xmm9
-	vpor	%xmm8,%xmm6,%xmm6
-	addl	%esi,%ebp
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm6,%xmm6
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%edi
-	movl	%ebp,%esi
-	addl	44(%rsp),%edx
-	vpxor	%xmm10,%xmm6,%xmm6
-	xorl	%ebx,%eax
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	shrdl	$7,%ebp,%ebp
-	xorl	%ebx,%esi
-	vpalignr	$8,%xmm3,%xmm4,%xmm7
-	movl	%edx,%edi
-	addl	48(%rsp),%ecx
-	vpaddd	%xmm6,%xmm11,%xmm9
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	vpsrldq	$4,%xmm6,%xmm8
-	addl	%esi,%ecx
-	andl	%ebp,%edi
-	vpxor	%xmm3,%xmm7,%xmm7
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	vpxor	%xmm5,%xmm8,%xmm8
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%edi
-	movl	%ecx,%esi
-	addl	52(%rsp),%ebx
-	vpxor	%xmm8,%xmm7,%xmm7
-	xorl	%ebp,%edx
-	shldl	$5,%ecx,%ecx
-	vmovdqa	%xmm9,32(%rsp)
-	addl	%edi,%ebx
-	andl	%edx,%esi
-	vpsrld	$31,%xmm7,%xmm8
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	shrdl	$7,%ecx,%ecx
-	xorl	%ebp,%esi
-	vpslldq	$12,%xmm7,%xmm10
-	vpaddd	%xmm7,%xmm7,%xmm7
-	movl	%ebx,%edi
-	addl	56(%rsp),%eax
-	xorl	%edx,%ecx
-	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm10,%xmm9
-	vpor	%xmm8,%xmm7,%xmm7
-	addl	%esi,%eax
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpslld	$2,%xmm10,%xmm10
-	vpxor	%xmm9,%xmm7,%xmm7
-	shrdl	$7,%ebx,%ebx
-	xorl	%edx,%edi
-	movl	%eax,%esi
-	addl	60(%rsp),%ebp
-	vpxor	%xmm10,%xmm7,%xmm7
-	xorl	%ecx,%ebx
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	vpalignr	$8,%xmm6,%xmm7,%xmm8
-	vpxor	%xmm4,%xmm0,%xmm0
-	shrdl	$7,%eax,%eax
-	xorl	%ecx,%esi
-	movl	%ebp,%edi
-	addl	0(%rsp),%edx
-	vpxor	%xmm1,%xmm0,%xmm0
-	xorl	%ebx,%eax
-	shldl	$5,%ebp,%ebp
-	vpaddd	%xmm7,%xmm11,%xmm9
-	addl	%esi,%edx
-	andl	%eax,%edi
-	vpxor	%xmm8,%xmm0,%xmm0
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	shrdl	$7,%ebp,%ebp
-	xorl	%ebx,%edi
-	vpsrld	$30,%xmm0,%xmm8
-	vmovdqa	%xmm9,48(%rsp)
-	movl	%edx,%esi
-	addl	4(%rsp),%ecx
-	xorl	%eax,%ebp
-	shldl	$5,%edx,%edx
-	vpslld	$2,%xmm0,%xmm0
-	addl	%edi,%ecx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	shrdl	$7,%edx,%edx
-	xorl	%eax,%esi
-	movl	%ecx,%edi
-	addl	8(%rsp),%ebx
-	vpor	%xmm8,%xmm0,%xmm0
-	xorl	%ebp,%edx
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	12(%rsp),%eax
-	xorl	%ebp,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm7,%xmm0,%xmm8
-	vpxor	%xmm5,%xmm1,%xmm1
-	addl	16(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	shldl	$5,%eax,%eax
-	vpxor	%xmm2,%xmm1,%xmm1
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	vpaddd	%xmm0,%xmm11,%xmm9
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	vpxor	%xmm8,%xmm1,%xmm1
-	addl	20(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	shldl	$5,%ebp,%ebp
-	vpsrld	$30,%xmm1,%xmm8
-	vmovdqa	%xmm9,0(%rsp)
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vpslld	$2,%xmm1,%xmm1
-	addl	24(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vpor	%xmm8,%xmm1,%xmm1
-	addl	28(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpalignr	$8,%xmm0,%xmm1,%xmm8
-	vpxor	%xmm6,%xmm2,%xmm2
-	addl	32(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	shldl	$5,%ebx,%ebx
-	vpxor	%xmm3,%xmm2,%xmm2
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	vpaddd	%xmm1,%xmm11,%xmm9
-	vmovdqa	0(%r14),%xmm11
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpxor	%xmm8,%xmm2,%xmm2
-	addl	36(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	vpsrld	$30,%xmm2,%xmm8
-	vmovdqa	%xmm9,16(%rsp)
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	vpslld	$2,%xmm2,%xmm2
-	addl	40(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	shldl	$5,%ebp,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vpor	%xmm8,%xmm2,%xmm2
-	addl	44(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vpalignr	$8,%xmm1,%xmm2,%xmm8
-	vpxor	%xmm7,%xmm3,%xmm3
-	addl	48(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	vpxor	%xmm4,%xmm3,%xmm3
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	vpaddd	%xmm2,%xmm11,%xmm9
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpxor	%xmm8,%xmm3,%xmm3
-	addl	52(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	vpsrld	$30,%xmm3,%xmm8
-	vmovdqa	%xmm9,32(%rsp)
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpslld	$2,%xmm3,%xmm3
-	addl	56(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	shldl	$5,%eax,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	vpor	%xmm8,%xmm3,%xmm3
-	addl	60(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vpalignr	$8,%xmm2,%xmm3,%xmm8
-	vpxor	%xmm0,%xmm4,%xmm4
-	addl	0(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	shldl	$5,%edx,%edx
-	vpxor	%xmm5,%xmm4,%xmm4
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	vpaddd	%xmm3,%xmm11,%xmm9
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vpxor	%xmm8,%xmm4,%xmm4
-	addl	4(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	vpsrld	$30,%xmm4,%xmm8
-	vmovdqa	%xmm9,48(%rsp)
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpslld	$2,%xmm4,%xmm4
-	addl	8(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vpor	%xmm8,%xmm4,%xmm4
-	addl	12(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	vpalignr	$8,%xmm3,%xmm4,%xmm8
-	vpxor	%xmm1,%xmm5,%xmm5
-	addl	16(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	shldl	$5,%ebp,%ebp
-	vpxor	%xmm6,%xmm5,%xmm5
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	vpaddd	%xmm4,%xmm11,%xmm9
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vpxor	%xmm8,%xmm5,%xmm5
-	addl	20(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	vpsrld	$30,%xmm5,%xmm8
-	vmovdqa	%xmm9,0(%rsp)
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vpslld	$2,%xmm5,%xmm5
-	addl	24(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vpor	%xmm8,%xmm5,%xmm5
-	addl	28(%rsp),%eax
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm4,%xmm5,%xmm8
-	vpxor	%xmm2,%xmm6,%xmm6
-	addl	32(%rsp),%ebp
-	andl	%ecx,%esi
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	vpxor	%xmm7,%xmm6,%xmm6
-	movl	%eax,%edi
-	xorl	%ecx,%esi
-	vpaddd	%xmm5,%xmm11,%xmm9
-	shldl	$5,%eax,%eax
-	addl	%esi,%ebp
-	vpxor	%xmm8,%xmm6,%xmm6
-	xorl	%ebx,%edi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	addl	36(%rsp),%edx
-	vpsrld	$30,%xmm6,%xmm8
-	vmovdqa	%xmm9,16(%rsp)
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	movl	%ebp,%esi
-	vpslld	$2,%xmm6,%xmm6
-	xorl	%ebx,%edi
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	addl	40(%rsp),%ecx
-	andl	%eax,%esi
-	vpor	%xmm8,%xmm6,%xmm6
-	xorl	%ebx,%eax
-	shrdl	$7,%ebp,%ebp
-	movl	%edx,%edi
-	xorl	%eax,%esi
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%ebp,%edi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	addl	44(%rsp),%ebx
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	shrdl	$7,%edx,%edx
-	movl	%ecx,%esi
-	xorl	%ebp,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%edx,%esi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	vpalignr	$8,%xmm5,%xmm6,%xmm8
-	vpxor	%xmm3,%xmm7,%xmm7
-	addl	48(%rsp),%eax
-	andl	%edx,%esi
-	xorl	%ebp,%edx
-	shrdl	$7,%ecx,%ecx
-	vpxor	%xmm0,%xmm7,%xmm7
-	movl	%ebx,%edi
-	xorl	%edx,%esi
-	vpaddd	%xmm6,%xmm11,%xmm9
-	vmovdqa	32(%r14),%xmm11
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	vpxor	%xmm8,%xmm7,%xmm7
-	xorl	%ecx,%edi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	52(%rsp),%ebp
-	vpsrld	$30,%xmm7,%xmm8
-	vmovdqa	%xmm9,32(%rsp)
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	movl	%eax,%esi
-	vpslld	$2,%xmm7,%xmm7
-	xorl	%ecx,%edi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	addl	56(%rsp),%edx
-	andl	%ebx,%esi
-	vpor	%xmm8,%xmm7,%xmm7
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%esi
-	shldl	$5,%ebp,%ebp
-	addl	%esi,%edx
-	xorl	%eax,%edi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	addl	60(%rsp),%ecx
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	shrdl	$7,%ebp,%ebp
-	movl	%edx,%esi
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%ebp,%esi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	vpalignr	$8,%xmm6,%xmm7,%xmm8
-	vpxor	%xmm4,%xmm0,%xmm0
-	addl	0(%rsp),%ebx
-	andl	%ebp,%esi
-	xorl	%eax,%ebp
-	shrdl	$7,%edx,%edx
-	vpxor	%xmm1,%xmm0,%xmm0
-	movl	%ecx,%edi
-	xorl	%ebp,%esi
-	vpaddd	%xmm7,%xmm11,%xmm9
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	vpxor	%xmm8,%xmm0,%xmm0
-	xorl	%edx,%edi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	4(%rsp),%eax
-	vpsrld	$30,%xmm0,%xmm8
-	vmovdqa	%xmm9,48(%rsp)
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	vpslld	$2,%xmm0,%xmm0
-	xorl	%edx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%ecx,%esi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	8(%rsp),%ebp
-	andl	%ecx,%esi
-	vpor	%xmm8,%xmm0,%xmm0
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	movl	%eax,%edi
-	xorl	%ecx,%esi
-	shldl	$5,%eax,%eax
-	addl	%esi,%ebp
-	xorl	%ebx,%edi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	addl	12(%rsp),%edx
-	andl	%ebx,%edi
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	movl	%ebp,%esi
-	xorl	%ebx,%edi
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	xorl	%eax,%esi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	vpalignr	$8,%xmm7,%xmm0,%xmm8
-	vpxor	%xmm5,%xmm1,%xmm1
-	addl	16(%rsp),%ecx
-	andl	%eax,%esi
-	xorl	%ebx,%eax
-	shrdl	$7,%ebp,%ebp
-	vpxor	%xmm2,%xmm1,%xmm1
-	movl	%edx,%edi
-	xorl	%eax,%esi
-	vpaddd	%xmm0,%xmm11,%xmm9
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	vpxor	%xmm8,%xmm1,%xmm1
-	xorl	%ebp,%edi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	addl	20(%rsp),%ebx
-	vpsrld	$30,%xmm1,%xmm8
-	vmovdqa	%xmm9,0(%rsp)
-	andl	%ebp,%edi
-	xorl	%eax,%ebp
-	shrdl	$7,%edx,%edx
-	movl	%ecx,%esi
-	vpslld	$2,%xmm1,%xmm1
-	xorl	%ebp,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%edx,%esi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	24(%rsp),%eax
-	andl	%edx,%esi
-	vpor	%xmm8,%xmm1,%xmm1
-	xorl	%ebp,%edx
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%edi
-	xorl	%edx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%ecx,%edi
-	xorl	%edx,%ecx
-	addl	%ebx,%eax
-	addl	28(%rsp),%ebp
-	andl	%ecx,%edi
-	xorl	%edx,%ecx
-	shrdl	$7,%ebx,%ebx
-	movl	%eax,%esi
-	xorl	%ecx,%edi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ebx,%esi
-	xorl	%ecx,%ebx
-	addl	%eax,%ebp
-	vpalignr	$8,%xmm0,%xmm1,%xmm8
-	vpxor	%xmm6,%xmm2,%xmm2
-	addl	32(%rsp),%edx
-	andl	%ebx,%esi
-	xorl	%ecx,%ebx
-	shrdl	$7,%eax,%eax
-	vpxor	%xmm3,%xmm2,%xmm2
-	movl	%ebp,%edi
-	xorl	%ebx,%esi
-	vpaddd	%xmm1,%xmm11,%xmm9
-	shldl	$5,%ebp,%ebp
-	addl	%esi,%edx
-	vpxor	%xmm8,%xmm2,%xmm2
-	xorl	%eax,%edi
-	xorl	%ebx,%eax
-	addl	%ebp,%edx
-	addl	36(%rsp),%ecx
-	vpsrld	$30,%xmm2,%xmm8
-	vmovdqa	%xmm9,16(%rsp)
-	andl	%eax,%edi
-	xorl	%ebx,%eax
-	shrdl	$7,%ebp,%ebp
-	movl	%edx,%esi
-	vpslld	$2,%xmm2,%xmm2
-	xorl	%eax,%edi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%ebp,%esi
-	xorl	%eax,%ebp
-	addl	%edx,%ecx
-	addl	40(%rsp),%ebx
-	andl	%ebp,%esi
-	vpor	%xmm8,%xmm2,%xmm2
-	xorl	%eax,%ebp
-	shrdl	$7,%edx,%edx
-	movl	%ecx,%edi
-	xorl	%ebp,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%edx,%edi
-	xorl	%ebp,%edx
-	addl	%ecx,%ebx
-	addl	44(%rsp),%eax
-	andl	%edx,%edi
-	xorl	%ebp,%edx
-	shrdl	$7,%ecx,%ecx
-	movl	%ebx,%esi
-	xorl	%edx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	addl	%ebx,%eax
-	vpalignr	$8,%xmm1,%xmm2,%xmm8
-	vpxor	%xmm7,%xmm3,%xmm3
-	addl	48(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	shldl	$5,%eax,%eax
-	vpxor	%xmm4,%xmm3,%xmm3
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	vpaddd	%xmm2,%xmm11,%xmm9
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	vpxor	%xmm8,%xmm3,%xmm3
-	addl	52(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	shldl	$5,%ebp,%ebp
-	vpsrld	$30,%xmm3,%xmm8
-	vmovdqa	%xmm9,32(%rsp)
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vpslld	$2,%xmm3,%xmm3
-	addl	56(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vpor	%xmm8,%xmm3,%xmm3
-	addl	60(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	0(%rsp),%eax
-	vpaddd	%xmm3,%xmm11,%xmm9
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	vmovdqa	%xmm9,48(%rsp)
-	xorl	%edx,%edi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	4(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	addl	8(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	shldl	$5,%ebp,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	12(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	cmpq	%r10,%r9
-	je	L$done_avx
-	vmovdqa	64(%r14),%xmm6
-	vmovdqa	-64(%r14),%xmm11
-	vmovdqu	0(%r9),%xmm0
-	vmovdqu	16(%r9),%xmm1
-	vmovdqu	32(%r9),%xmm2
-	vmovdqu	48(%r9),%xmm3
-	vpshufb	%xmm6,%xmm0,%xmm0
-	addq	$64,%r9
-	addl	16(%rsp),%ebx
-	xorl	%ebp,%esi
-	vpshufb	%xmm6,%xmm1,%xmm1
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	vpaddd	%xmm11,%xmm0,%xmm4
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	vmovdqa	%xmm4,0(%rsp)
-	addl	20(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	24(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	shldl	$5,%eax,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	addl	28(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	32(%rsp),%ecx
-	xorl	%eax,%esi
-	vpshufb	%xmm6,%xmm2,%xmm2
-	movl	%edx,%edi
-	shldl	$5,%edx,%edx
-	vpaddd	%xmm11,%xmm1,%xmm5
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	vmovdqa	%xmm5,16(%rsp)
-	addl	36(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	40(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	44(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	addl	48(%rsp),%edx
-	xorl	%ebx,%esi
-	vpshufb	%xmm6,%xmm3,%xmm3
-	movl	%ebp,%edi
-	shldl	$5,%ebp,%ebp
-	vpaddd	%xmm11,%xmm2,%xmm6
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	vmovdqa	%xmm6,32(%rsp)
-	addl	52(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	addl	56(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	60(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	0(%r8),%eax
-	addl	4(%r8),%esi
-	addl	8(%r8),%ecx
-	addl	12(%r8),%edx
-	movl	%eax,0(%r8)
-	addl	16(%r8),%ebp
-	movl	%esi,4(%r8)
-	movl	%esi,%ebx
-	movl	%ecx,8(%r8)
-	movl	%ecx,%edi
-	movl	%edx,12(%r8)
-	xorl	%edx,%edi
-	movl	%ebp,16(%r8)
-	andl	%edi,%esi
-	jmp	L$oop_avx
-
-.p2align	4
-L$done_avx:
-	addl	16(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	20(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	xorl	%edx,%esi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	24(%rsp),%ebp
-	xorl	%ecx,%esi
-	movl	%eax,%edi
-	shldl	$5,%eax,%eax
-	addl	%esi,%ebp
-	xorl	%ecx,%edi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	addl	28(%rsp),%edx
-	xorl	%ebx,%edi
-	movl	%ebp,%esi
-	shldl	$5,%ebp,%ebp
-	addl	%edi,%edx
-	xorl	%ebx,%esi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	32(%rsp),%ecx
-	xorl	%eax,%esi
-	movl	%edx,%edi
-	shldl	$5,%edx,%edx
-	addl	%esi,%ecx
-	xorl	%eax,%edi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	addl	36(%rsp),%ebx
-	xorl	%ebp,%edi
-	movl	%ecx,%esi
-	shldl	$5,%ecx,%ecx
-	addl	%edi,%ebx
-	xorl	%ebp,%esi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	40(%rsp),%eax
-	xorl	%edx,%esi
-	movl	%ebx,%edi
-	shldl	$5,%ebx,%ebx
-	addl	%esi,%eax
-	xorl	%edx,%edi
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	addl	44(%rsp),%ebp
-	xorl	%ecx,%edi
-	movl	%eax,%esi
-	shldl	$5,%eax,%eax
-	addl	%edi,%ebp
-	xorl	%ecx,%esi
-	shrdl	$7,%ebx,%ebx
-	addl	%eax,%ebp
-	addl	48(%rsp),%edx
-	xorl	%ebx,%esi
-	movl	%ebp,%edi
-	shldl	$5,%ebp,%ebp
-	addl	%esi,%edx
-	xorl	%ebx,%edi
-	shrdl	$7,%eax,%eax
-	addl	%ebp,%edx
-	addl	52(%rsp),%ecx
-	xorl	%eax,%edi
-	movl	%edx,%esi
-	shldl	$5,%edx,%edx
-	addl	%edi,%ecx
-	xorl	%eax,%esi
-	shrdl	$7,%ebp,%ebp
-	addl	%edx,%ecx
-	addl	56(%rsp),%ebx
-	xorl	%ebp,%esi
-	movl	%ecx,%edi
-	shldl	$5,%ecx,%ecx
-	addl	%esi,%ebx
-	xorl	%ebp,%edi
-	shrdl	$7,%edx,%edx
-	addl	%ecx,%ebx
-	addl	60(%rsp),%eax
-	xorl	%edx,%edi
-	movl	%ebx,%esi
-	shldl	$5,%ebx,%ebx
-	addl	%edi,%eax
-	shrdl	$7,%ecx,%ecx
-	addl	%ebx,%eax
-	vzeroupper
-
-	addl	0(%r8),%eax
-	addl	4(%r8),%esi
-	addl	8(%r8),%ecx
-	movl	%eax,0(%r8)
-	addl	12(%r8),%edx
-	movl	%esi,4(%r8)
-	addl	16(%r8),%ebp
-	movl	%ecx,8(%r8)
-	movl	%edx,12(%r8)
-	movl	%ebp,16(%r8)
-	movq	-40(%r11),%r14
-
-	movq	-32(%r11),%r13
-
-	movq	-24(%r11),%r12
-
-	movq	-16(%r11),%rbp
-
-	movq	-8(%r11),%rbx
-
-	leaq	(%r11),%rsp
-
-L$epilogue_avx:
-	ret
-
-
-
-.p2align	4
-sha1_block_data_order_avx2:
-_avx2_shortcut:
-
-	movq	%rsp,%r11
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	vzeroupper
-	movq	%rdi,%r8
-	movq	%rsi,%r9
-	movq	%rdx,%r10
-
-	leaq	-640(%rsp),%rsp
-	shlq	$6,%r10
-	leaq	64(%r9),%r13
-	andq	$-128,%rsp
-	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r14
-
-	movl	0(%r8),%eax
-	cmpq	%r10,%r13
-	cmovaeq	%r9,%r13
-	movl	4(%r8),%ebp
-	movl	8(%r8),%ecx
-	movl	12(%r8),%edx
-	movl	16(%r8),%esi
-	vmovdqu	64(%r14),%ymm6
-
-	vmovdqu	(%r9),%xmm0
-	vmovdqu	16(%r9),%xmm1
-	vmovdqu	32(%r9),%xmm2
-	vmovdqu	48(%r9),%xmm3
-	leaq	64(%r9),%r9
-	vinserti128	$1,(%r13),%ymm0,%ymm0
-	vinserti128	$1,16(%r13),%ymm1,%ymm1
-	vpshufb	%ymm6,%ymm0,%ymm0
-	vinserti128	$1,32(%r13),%ymm2,%ymm2
-	vpshufb	%ymm6,%ymm1,%ymm1
-	vinserti128	$1,48(%r13),%ymm3,%ymm3
-	vpshufb	%ymm6,%ymm2,%ymm2
-	vmovdqu	-64(%r14),%ymm11
-	vpshufb	%ymm6,%ymm3,%ymm3
-
-	vpaddd	%ymm11,%ymm0,%ymm4
-	vpaddd	%ymm11,%ymm1,%ymm5
-	vmovdqu	%ymm4,0(%rsp)
-	vpaddd	%ymm11,%ymm2,%ymm6
-	vmovdqu	%ymm5,32(%rsp)
-	vpaddd	%ymm11,%ymm3,%ymm7
-	vmovdqu	%ymm6,64(%rsp)
-	vmovdqu	%ymm7,96(%rsp)
-	vpalignr	$8,%ymm0,%ymm1,%ymm4
-	vpsrldq	$4,%ymm3,%ymm8
-	vpxor	%ymm0,%ymm4,%ymm4
-	vpxor	%ymm2,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm4,%ymm4
-	vpsrld	$31,%ymm4,%ymm8
-	vpslldq	$12,%ymm4,%ymm10
-	vpaddd	%ymm4,%ymm4,%ymm4
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm4,%ymm4
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm4,%ymm4
-	vpxor	%ymm10,%ymm4,%ymm4
-	vpaddd	%ymm11,%ymm4,%ymm9
-	vmovdqu	%ymm9,128(%rsp)
-	vpalignr	$8,%ymm1,%ymm2,%ymm5
-	vpsrldq	$4,%ymm4,%ymm8
-	vpxor	%ymm1,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm5,%ymm5
-	vpsrld	$31,%ymm5,%ymm8
-	vmovdqu	-32(%r14),%ymm11
-	vpslldq	$12,%ymm5,%ymm10
-	vpaddd	%ymm5,%ymm5,%ymm5
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm5,%ymm5
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm5,%ymm5
-	vpxor	%ymm10,%ymm5,%ymm5
-	vpaddd	%ymm11,%ymm5,%ymm9
-	vmovdqu	%ymm9,160(%rsp)
-	vpalignr	$8,%ymm2,%ymm3,%ymm6
-	vpsrldq	$4,%ymm5,%ymm8
-	vpxor	%ymm2,%ymm6,%ymm6
-	vpxor	%ymm4,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm6,%ymm6
-	vpsrld	$31,%ymm6,%ymm8
-	vpslldq	$12,%ymm6,%ymm10
-	vpaddd	%ymm6,%ymm6,%ymm6
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm6,%ymm6
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm6,%ymm6
-	vpxor	%ymm10,%ymm6,%ymm6
-	vpaddd	%ymm11,%ymm6,%ymm9
-	vmovdqu	%ymm9,192(%rsp)
-	vpalignr	$8,%ymm3,%ymm4,%ymm7
-	vpsrldq	$4,%ymm6,%ymm8
-	vpxor	%ymm3,%ymm7,%ymm7
-	vpxor	%ymm5,%ymm8,%ymm8
-	vpxor	%ymm8,%ymm7,%ymm7
-	vpsrld	$31,%ymm7,%ymm8
-	vpslldq	$12,%ymm7,%ymm10
-	vpaddd	%ymm7,%ymm7,%ymm7
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm7,%ymm7
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm7,%ymm7
-	vpxor	%ymm10,%ymm7,%ymm7
-	vpaddd	%ymm11,%ymm7,%ymm9
-	vmovdqu	%ymm9,224(%rsp)
-	leaq	128(%rsp),%r13
-	jmp	L$oop_avx2
-.p2align	5
-L$oop_avx2:
-	rorxl	$2,%ebp,%ebx
-	andnl	%edx,%ebp,%edi
-	andl	%ecx,%ebp
-	xorl	%edi,%ebp
-	jmp	L$align32_1
-.p2align	5
-L$align32_1:
-	vpalignr	$8,%ymm6,%ymm7,%ymm8
-	vpxor	%ymm4,%ymm0,%ymm0
-	addl	-128(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	vpxor	%ymm1,%ymm0,%ymm0
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	vpxor	%ymm8,%ymm0,%ymm0
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	vpsrld	$30,%ymm0,%ymm8
-	vpslld	$2,%ymm0,%ymm0
-	addl	-124(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	vpor	%ymm8,%ymm0,%ymm0
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-120(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	vpaddd	%ymm11,%ymm0,%ymm9
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	vmovdqu	%ymm9,256(%rsp)
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	addl	-116(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	addl	-96(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	andl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	vpalignr	$8,%ymm7,%ymm0,%ymm8
-	vpxor	%ymm5,%ymm1,%ymm1
-	addl	-92(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	vpxor	%ymm2,%ymm1,%ymm1
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	vpxor	%ymm8,%ymm1,%ymm1
-	andl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	vpsrld	$30,%ymm1,%ymm8
-	vpslld	$2,%ymm1,%ymm1
-	addl	-88(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	vpor	%ymm8,%ymm1,%ymm1
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	-84(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	vpaddd	%ymm11,%ymm1,%ymm9
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	vmovdqu	%ymm9,288(%rsp)
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-64(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	addl	-60(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	vpalignr	$8,%ymm0,%ymm1,%ymm8
-	vpxor	%ymm6,%ymm2,%ymm2
-	addl	-56(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	vpxor	%ymm3,%ymm2,%ymm2
-	vmovdqu	0(%r14),%ymm11
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	vpxor	%ymm8,%ymm2,%ymm2
-	andl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	vpsrld	$30,%ymm2,%ymm8
-	vpslld	$2,%ymm2,%ymm2
-	addl	-52(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	andl	%ecx,%ebp
-	vpor	%ymm8,%ymm2,%ymm2
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	addl	-32(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	vpaddd	%ymm11,%ymm2,%ymm9
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	vmovdqu	%ymm9,320(%rsp)
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	-28(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-24(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	vpalignr	$8,%ymm1,%ymm2,%ymm8
-	vpxor	%ymm7,%ymm3,%ymm3
-	addl	-20(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	vpxor	%ymm4,%ymm3,%ymm3
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	vpxor	%ymm8,%ymm3,%ymm3
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	vpsrld	$30,%ymm3,%ymm8
-	vpslld	$2,%ymm3,%ymm3
-	addl	0(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	andl	%edx,%ebx
-	vpor	%ymm8,%ymm3,%ymm3
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	addl	4(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	vpaddd	%ymm11,%ymm3,%ymm9
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	andl	%ecx,%ebp
-	vmovdqu	%ymm9,352(%rsp)
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	addl	8(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	12(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	vpalignr	$8,%ymm2,%ymm3,%ymm8
-	vpxor	%ymm0,%ymm4,%ymm4
-	addl	32(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	vpxor	%ymm5,%ymm4,%ymm4
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	vpxor	%ymm8,%ymm4,%ymm4
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	36(%r13),%ebx
-	vpsrld	$30,%ymm4,%ymm8
-	vpslld	$2,%ymm4,%ymm4
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	vpor	%ymm8,%ymm4,%ymm4
-	addl	40(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	vpaddd	%ymm11,%ymm4,%ymm9
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	44(%r13),%eax
-	vmovdqu	%ymm9,384(%rsp)
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	64(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	vpalignr	$8,%ymm3,%ymm4,%ymm8
-	vpxor	%ymm1,%ymm5,%ymm5
-	addl	68(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	vpxor	%ymm6,%ymm5,%ymm5
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	vpxor	%ymm8,%ymm5,%ymm5
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	72(%r13),%ecx
-	vpsrld	$30,%ymm5,%ymm8
-	vpslld	$2,%ymm5,%ymm5
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	vpor	%ymm8,%ymm5,%ymm5
-	addl	76(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	vpaddd	%ymm11,%ymm5,%ymm9
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	96(%r13),%ebp
-	vmovdqu	%ymm9,416(%rsp)
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	100(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	vpalignr	$8,%ymm4,%ymm5,%ymm8
-	vpxor	%ymm2,%ymm6,%ymm6
-	addl	104(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	vpxor	%ymm7,%ymm6,%ymm6
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	vpxor	%ymm8,%ymm6,%ymm6
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	108(%r13),%edx
-	leaq	256(%r13),%r13
-	vpsrld	$30,%ymm6,%ymm8
-	vpslld	$2,%ymm6,%ymm6
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	vpor	%ymm8,%ymm6,%ymm6
-	addl	-128(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	vpaddd	%ymm11,%ymm6,%ymm9
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-124(%r13),%ebx
-	vmovdqu	%ymm9,448(%rsp)
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-120(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	vpalignr	$8,%ymm5,%ymm6,%ymm8
-	vpxor	%ymm3,%ymm7,%ymm7
-	addl	-116(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	vpxor	%ymm0,%ymm7,%ymm7
-	vmovdqu	32(%r14),%ymm11
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	vpxor	%ymm8,%ymm7,%ymm7
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-96(%r13),%esi
-	vpsrld	$30,%ymm7,%ymm8
-	vpslld	$2,%ymm7,%ymm7
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	vpor	%ymm8,%ymm7,%ymm7
-	addl	-92(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	vpaddd	%ymm11,%ymm7,%ymm9
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	-88(%r13),%ecx
-	vmovdqu	%ymm9,480(%rsp)
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-84(%r13),%ebx
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	jmp	L$align32_2
-.p2align	5
-L$align32_2:
-	vpalignr	$8,%ymm6,%ymm7,%ymm8
-	vpxor	%ymm4,%ymm0,%ymm0
-	addl	-64(%r13),%ebp
-	xorl	%esi,%ecx
-	vpxor	%ymm1,%ymm0,%ymm0
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	vpxor	%ymm8,%ymm0,%ymm0
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	vpsrld	$30,%ymm0,%ymm8
-	vpslld	$2,%ymm0,%ymm0
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	-60(%r13),%eax
-	xorl	%edx,%ebx
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	vpor	%ymm8,%ymm0,%ymm0
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	vpaddd	%ymm11,%ymm0,%ymm9
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	-56(%r13),%esi
-	xorl	%ecx,%ebp
-	vmovdqu	%ymm9,512(%rsp)
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	addl	-52(%r13),%edx
-	xorl	%ebx,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	andl	%edi,%esi
-	addl	-32(%r13),%ecx
-	xorl	%ebp,%esi
-	movl	%eax,%edi
-	xorl	%ebp,%edi
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	vpalignr	$8,%ymm7,%ymm0,%ymm8
-	vpxor	%ymm5,%ymm1,%ymm1
-	addl	-28(%r13),%ebx
-	xorl	%eax,%edx
-	vpxor	%ymm2,%ymm1,%ymm1
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	vpxor	%ymm8,%ymm1,%ymm1
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	vpsrld	$30,%ymm1,%ymm8
-	vpslld	$2,%ymm1,%ymm1
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	-24(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	vpor	%ymm8,%ymm1,%ymm1
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	vpaddd	%ymm11,%ymm1,%ymm9
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	-20(%r13),%eax
-	xorl	%edx,%ebx
-	vmovdqu	%ymm9,544(%rsp)
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	0(%r13),%esi
-	xorl	%ecx,%ebp
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	addl	4(%r13),%edx
-	xorl	%ebx,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	andl	%edi,%esi
-	vpalignr	$8,%ymm0,%ymm1,%ymm8
-	vpxor	%ymm6,%ymm2,%ymm2
-	addl	8(%r13),%ecx
-	xorl	%ebp,%esi
-	vpxor	%ymm3,%ymm2,%ymm2
-	movl	%eax,%edi
-	xorl	%ebp,%edi
-	leal	(%rcx,%rsi,1),%ecx
-	vpxor	%ymm8,%ymm2,%ymm2
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	vpsrld	$30,%ymm2,%ymm8
-	vpslld	$2,%ymm2,%ymm2
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	addl	12(%r13),%ebx
-	xorl	%eax,%edx
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	vpor	%ymm8,%ymm2,%ymm2
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	vpaddd	%ymm11,%ymm2,%ymm9
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	32(%r13),%ebp
-	xorl	%esi,%ecx
-	vmovdqu	%ymm9,576(%rsp)
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	36(%r13),%eax
-	xorl	%edx,%ebx
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	40(%r13),%esi
-	xorl	%ecx,%ebp
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	vpalignr	$8,%ymm1,%ymm2,%ymm8
-	vpxor	%ymm7,%ymm3,%ymm3
-	addl	44(%r13),%edx
-	xorl	%ebx,%eax
-	vpxor	%ymm4,%ymm3,%ymm3
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	leal	(%rdx,%rax,1),%edx
-	vpxor	%ymm8,%ymm3,%ymm3
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	vpsrld	$30,%ymm3,%ymm8
-	vpslld	$2,%ymm3,%ymm3
-	addl	%r12d,%edx
-	andl	%edi,%esi
-	addl	64(%r13),%ecx
-	xorl	%ebp,%esi
-	movl	%eax,%edi
-	xorl	%ebp,%edi
-	vpor	%ymm8,%ymm3,%ymm3
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	vpaddd	%ymm11,%ymm3,%ymm9
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	addl	68(%r13),%ebx
-	xorl	%eax,%edx
-	vmovdqu	%ymm9,608(%rsp)
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	72(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	76(%r13),%eax
-	xorl	%edx,%ebx
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	96(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	100(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	104(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	108(%r13),%ebx
-	leaq	256(%r13),%r13
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-128(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	-124(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-120(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	-116(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	-96(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-92(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-88(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	-84(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-64(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	-60(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	-56(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-52(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-32(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	-28(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-24(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	-20(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	addl	%r12d,%edx
-	leaq	128(%r9),%r13
-	leaq	128(%r9),%rdi
-	cmpq	%r10,%r13
-	cmovaeq	%r9,%r13
-
-
-	addl	0(%r8),%edx
-	addl	4(%r8),%esi
-	addl	8(%r8),%ebp
-	movl	%edx,0(%r8)
-	addl	12(%r8),%ebx
-	movl	%esi,4(%r8)
-	movl	%edx,%eax
-	addl	16(%r8),%ecx
-	movl	%ebp,%r12d
-	movl	%ebp,8(%r8)
-	movl	%ebx,%edx
-
-	movl	%ebx,12(%r8)
-	movl	%esi,%ebp
-	movl	%ecx,16(%r8)
-
-	movl	%ecx,%esi
-	movl	%r12d,%ecx
-
-
-	cmpq	%r10,%r9
-	je	L$done_avx2
-	vmovdqu	64(%r14),%ymm6
-	cmpq	%r10,%rdi
-	ja	L$ast_avx2
-
-	vmovdqu	-64(%rdi),%xmm0
-	vmovdqu	-48(%rdi),%xmm1
-	vmovdqu	-32(%rdi),%xmm2
-	vmovdqu	-16(%rdi),%xmm3
-	vinserti128	$1,0(%r13),%ymm0,%ymm0
-	vinserti128	$1,16(%r13),%ymm1,%ymm1
-	vinserti128	$1,32(%r13),%ymm2,%ymm2
-	vinserti128	$1,48(%r13),%ymm3,%ymm3
-	jmp	L$ast_avx2
-
-.p2align	5
-L$ast_avx2:
-	leaq	128+16(%rsp),%r13
-	rorxl	$2,%ebp,%ebx
-	andnl	%edx,%ebp,%edi
-	andl	%ecx,%ebp
-	xorl	%edi,%ebp
-	subq	$-128,%r9
-	addl	-128(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	-124(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-120(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	addl	-116(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	addl	-96(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	andl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	addl	-92(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	andl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	addl	-88(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	-84(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-64(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	addl	-60(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	addl	-56(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	andl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	addl	-52(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	andl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	addl	-32(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	-28(%r13),%edx
-	andnl	%ebx,%esi,%edi
-	addl	%eax,%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	andl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%edi,%esi
-	addl	-24(%r13),%ecx
-	andnl	%ebp,%edx,%edi
-	addl	%esi,%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	andl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%edi,%edx
-	addl	-20(%r13),%ebx
-	andnl	%eax,%ecx,%edi
-	addl	%edx,%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	andl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%edi,%ecx
-	addl	0(%r13),%ebp
-	andnl	%esi,%ebx,%edi
-	addl	%ecx,%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	andl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%edi,%ebx
-	addl	4(%r13),%eax
-	andnl	%edx,%ebp,%edi
-	addl	%ebx,%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	andl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edi,%ebp
-	addl	8(%r13),%esi
-	andnl	%ecx,%eax,%edi
-	addl	%ebp,%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	andl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%edi,%eax
-	addl	12(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	32(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	36(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	40(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	44(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	64(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	vmovdqu	-64(%r14),%ymm11
-	vpshufb	%ymm6,%ymm0,%ymm0
-	addl	68(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	72(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	76(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	96(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	100(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	vpshufb	%ymm6,%ymm1,%ymm1
-	vpaddd	%ymm11,%ymm0,%ymm8
-	addl	104(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	108(%r13),%edx
-	leaq	256(%r13),%r13
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	-128(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-124(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-120(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	vmovdqu	%ymm8,0(%rsp)
-	vpshufb	%ymm6,%ymm2,%ymm2
-	vpaddd	%ymm11,%ymm1,%ymm9
-	addl	-116(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-96(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	-92(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	addl	-88(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-84(%r13),%ebx
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	vmovdqu	%ymm9,32(%rsp)
-	vpshufb	%ymm6,%ymm3,%ymm3
-	vpaddd	%ymm11,%ymm2,%ymm6
-	addl	-64(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	-60(%r13),%eax
-	xorl	%edx,%ebx
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	-56(%r13),%esi
-	xorl	%ecx,%ebp
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	addl	-52(%r13),%edx
-	xorl	%ebx,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	andl	%edi,%esi
-	addl	-32(%r13),%ecx
-	xorl	%ebp,%esi
-	movl	%eax,%edi
-	xorl	%ebp,%edi
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	jmp	L$align32_3
-.p2align	5
-L$align32_3:
-	vmovdqu	%ymm6,64(%rsp)
-	vpaddd	%ymm11,%ymm3,%ymm7
-	addl	-28(%r13),%ebx
-	xorl	%eax,%edx
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	-24(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	-20(%r13),%eax
-	xorl	%edx,%ebx
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	0(%r13),%esi
-	xorl	%ecx,%ebp
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	addl	4(%r13),%edx
-	xorl	%ebx,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	andl	%edi,%esi
-	vmovdqu	%ymm7,96(%rsp)
-	addl	8(%r13),%ecx
-	xorl	%ebp,%esi
-	movl	%eax,%edi
-	xorl	%ebp,%edi
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	addl	12(%r13),%ebx
-	xorl	%eax,%edx
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	32(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	36(%r13),%eax
-	xorl	%edx,%ebx
-	movl	%ecx,%edi
-	xorl	%edx,%edi
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	andl	%edi,%ebp
-	addl	40(%r13),%esi
-	xorl	%ecx,%ebp
-	movl	%ebx,%edi
-	xorl	%ecx,%edi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	andl	%edi,%eax
-	vpalignr	$8,%ymm0,%ymm1,%ymm4
-	addl	44(%r13),%edx
-	xorl	%ebx,%eax
-	movl	%ebp,%edi
-	xorl	%ebx,%edi
-	vpsrldq	$4,%ymm3,%ymm8
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	vpxor	%ymm0,%ymm4,%ymm4
-	vpxor	%ymm2,%ymm8,%ymm8
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	vpxor	%ymm8,%ymm4,%ymm4
-	andl	%edi,%esi
-	addl	64(%r13),%ecx
-	xorl	%ebp,%esi
-	movl	%eax,%edi
-	vpsrld	$31,%ymm4,%ymm8
-	xorl	%ebp,%edi
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	vpslldq	$12,%ymm4,%ymm10
-	vpaddd	%ymm4,%ymm4,%ymm4
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm4,%ymm4
-	addl	%r12d,%ecx
-	andl	%edi,%edx
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm4,%ymm4
-	addl	68(%r13),%ebx
-	xorl	%eax,%edx
-	vpxor	%ymm10,%ymm4,%ymm4
-	movl	%esi,%edi
-	xorl	%eax,%edi
-	leal	(%rbx,%rdx,1),%ebx
-	vpaddd	%ymm11,%ymm4,%ymm9
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	vmovdqu	%ymm9,128(%rsp)
-	addl	%r12d,%ebx
-	andl	%edi,%ecx
-	addl	72(%r13),%ebp
-	xorl	%esi,%ecx
-	movl	%edx,%edi
-	xorl	%esi,%edi
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	andl	%edi,%ebx
-	addl	76(%r13),%eax
-	xorl	%edx,%ebx
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	vpalignr	$8,%ymm1,%ymm2,%ymm5
-	addl	96(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	vpsrldq	$4,%ymm4,%ymm8
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	vpxor	%ymm1,%ymm5,%ymm5
-	vpxor	%ymm3,%ymm8,%ymm8
-	addl	100(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	vpxor	%ymm8,%ymm5,%ymm5
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	vpsrld	$31,%ymm5,%ymm8
-	vmovdqu	-32(%r14),%ymm11
-	xorl	%ebx,%esi
-	addl	104(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	vpslldq	$12,%ymm5,%ymm10
-	vpaddd	%ymm5,%ymm5,%ymm5
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm5,%ymm5
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm5,%ymm5
-	xorl	%ebp,%edx
-	addl	108(%r13),%ebx
-	leaq	256(%r13),%r13
-	vpxor	%ymm10,%ymm5,%ymm5
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	vpaddd	%ymm11,%ymm5,%ymm9
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	vmovdqu	%ymm9,160(%rsp)
-	addl	-128(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	vpalignr	$8,%ymm2,%ymm3,%ymm6
-	addl	-124(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	vpsrldq	$4,%ymm5,%ymm8
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	vpxor	%ymm2,%ymm6,%ymm6
-	vpxor	%ymm4,%ymm8,%ymm8
-	addl	-120(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	vpxor	%ymm8,%ymm6,%ymm6
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	vpsrld	$31,%ymm6,%ymm8
-	xorl	%ecx,%eax
-	addl	-116(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	vpslldq	$12,%ymm6,%ymm10
-	vpaddd	%ymm6,%ymm6,%ymm6
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm6,%ymm6
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm6,%ymm6
-	xorl	%ebx,%esi
-	addl	-96(%r13),%ecx
-	vpxor	%ymm10,%ymm6,%ymm6
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	vpaddd	%ymm11,%ymm6,%ymm9
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	vmovdqu	%ymm9,192(%rsp)
-	addl	-92(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	vpalignr	$8,%ymm3,%ymm4,%ymm7
-	addl	-88(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	vpsrldq	$4,%ymm6,%ymm8
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	vpxor	%ymm3,%ymm7,%ymm7
-	vpxor	%ymm5,%ymm8,%ymm8
-	addl	-84(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	vpxor	%ymm8,%ymm7,%ymm7
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	vpsrld	$31,%ymm7,%ymm8
-	xorl	%edx,%ebp
-	addl	-64(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	vpslldq	$12,%ymm7,%ymm10
-	vpaddd	%ymm7,%ymm7,%ymm7
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	vpsrld	$30,%ymm10,%ymm9
-	vpor	%ymm8,%ymm7,%ymm7
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	vpslld	$2,%ymm10,%ymm10
-	vpxor	%ymm9,%ymm7,%ymm7
-	xorl	%ecx,%eax
-	addl	-60(%r13),%edx
-	vpxor	%ymm10,%ymm7,%ymm7
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	rorxl	$2,%esi,%eax
-	vpaddd	%ymm11,%ymm7,%ymm9
-	xorl	%ebp,%esi
-	addl	%r12d,%edx
-	xorl	%ebx,%esi
-	vmovdqu	%ymm9,224(%rsp)
-	addl	-56(%r13),%ecx
-	leal	(%rcx,%rsi,1),%ecx
-	rorxl	$27,%edx,%r12d
-	rorxl	$2,%edx,%esi
-	xorl	%eax,%edx
-	addl	%r12d,%ecx
-	xorl	%ebp,%edx
-	addl	-52(%r13),%ebx
-	leal	(%rbx,%rdx,1),%ebx
-	rorxl	$27,%ecx,%r12d
-	rorxl	$2,%ecx,%edx
-	xorl	%esi,%ecx
-	addl	%r12d,%ebx
-	xorl	%eax,%ecx
-	addl	-32(%r13),%ebp
-	leal	(%rcx,%rbp,1),%ebp
-	rorxl	$27,%ebx,%r12d
-	rorxl	$2,%ebx,%ecx
-	xorl	%edx,%ebx
-	addl	%r12d,%ebp
-	xorl	%esi,%ebx
-	addl	-28(%r13),%eax
-	leal	(%rax,%rbx,1),%eax
-	rorxl	$27,%ebp,%r12d
-	rorxl	$2,%ebp,%ebx
-	xorl	%ecx,%ebp
-	addl	%r12d,%eax
-	xorl	%edx,%ebp
-	addl	-24(%r13),%esi
-	leal	(%rsi,%rbp,1),%esi
-	rorxl	$27,%eax,%r12d
-	rorxl	$2,%eax,%ebp
-	xorl	%ebx,%eax
-	addl	%r12d,%esi
-	xorl	%ecx,%eax
-	addl	-20(%r13),%edx
-	leal	(%rdx,%rax,1),%edx
-	rorxl	$27,%esi,%r12d
-	addl	%r12d,%edx
-	leaq	128(%rsp),%r13
-
-
-	addl	0(%r8),%edx
-	addl	4(%r8),%esi
-	addl	8(%r8),%ebp
-	movl	%edx,0(%r8)
-	addl	12(%r8),%ebx
-	movl	%esi,4(%r8)
-	movl	%edx,%eax
-	addl	16(%r8),%ecx
-	movl	%ebp,%r12d
-	movl	%ebp,8(%r8)
-	movl	%ebx,%edx
-
-	movl	%ebx,12(%r8)
-	movl	%esi,%ebp
-	movl	%ecx,16(%r8)
-
-	movl	%ecx,%esi
-	movl	%r12d,%ecx
-
-
-	cmpq	%r10,%r9
-	jbe	L$oop_avx2
-
-L$done_avx2:
-	vzeroupper
-	movq	-40(%r11),%r14
-
-	movq	-32(%r11),%r13
-
-	movq	-24(%r11),%r12
-
-	movq	-16(%r11),%rbp
-
-	movq	-8(%r11),%rbx
-
-	leaq	(%r11),%rsp
-
-L$epilogue_avx2:
-	ret
-
-
-.section	__DATA,__const
-.p2align	6
-K_XX_XX:
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
-.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.p2align	6
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S
deleted file mode 100644
index 018af0d..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha256-x86_64-apple.S
+++ /dev/null
@@ -1,4178 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-.globl	_sha256_block_data_order
-.private_extern _sha256_block_data_order
-
-.p2align	4
-_sha256_block_data_order:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%r11
-	movl	0(%r11),%r9d
-	movl	4(%r11),%r10d
-	movl	8(%r11),%r11d
-	testl	$536870912,%r11d
-	jnz	L$shaext_shortcut
-	andl	$1073741824,%r9d
-	andl	$268435968,%r10d
-	orl	%r9d,%r10d
-	cmpl	$1342177792,%r10d
-	je	L$avx_shortcut
-	testl	$512,%r10d
-	jnz	L$ssse3_shortcut
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	shlq	$4,%rdx
-	subq	$64+32,%rsp
-	leaq	(%rsi,%rdx,4),%rdx
-	andq	$-64,%rsp
-	movq	%rdi,64+0(%rsp)
-	movq	%rsi,64+8(%rsp)
-	movq	%rdx,64+16(%rsp)
-	movq	%rax,88(%rsp)
-
-L$prologue:
-
-	movl	0(%rdi),%eax
-	movl	4(%rdi),%ebx
-	movl	8(%rdi),%ecx
-	movl	12(%rdi),%edx
-	movl	16(%rdi),%r8d
-	movl	20(%rdi),%r9d
-	movl	24(%rdi),%r10d
-	movl	28(%rdi),%r11d
-	jmp	L$loop
-
-.p2align	4
-L$loop:
-	movl	%ebx,%edi
-	leaq	K256(%rip),%rbp
-	xorl	%ecx,%edi
-	movl	0(%rsi),%r12d
-	movl	%r8d,%r13d
-	movl	%eax,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r9d,%r15d
-
-	xorl	%r8d,%r13d
-	rorl	$9,%r14d
-	xorl	%r10d,%r15d
-
-	movl	%r12d,0(%rsp)
-	xorl	%eax,%r14d
-	andl	%r8d,%r15d
-
-	rorl	$5,%r13d
-	addl	%r11d,%r12d
-	xorl	%r10d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r8d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%eax,%r15d
-	addl	(%rbp),%r12d
-	xorl	%eax,%r14d
-
-	xorl	%ebx,%r15d
-	rorl	$6,%r13d
-	movl	%ebx,%r11d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r11d
-	addl	%r12d,%edx
-	addl	%r12d,%r11d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r11d
-	movl	4(%rsi),%r12d
-	movl	%edx,%r13d
-	movl	%r11d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r8d,%edi
-
-	xorl	%edx,%r13d
-	rorl	$9,%r14d
-	xorl	%r9d,%edi
-
-	movl	%r12d,4(%rsp)
-	xorl	%r11d,%r14d
-	andl	%edx,%edi
-
-	rorl	$5,%r13d
-	addl	%r10d,%r12d
-	xorl	%r9d,%edi
-
-	rorl	$11,%r14d
-	xorl	%edx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r11d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r11d,%r14d
-
-	xorl	%eax,%edi
-	rorl	$6,%r13d
-	movl	%eax,%r10d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r10d
-	addl	%r12d,%ecx
-	addl	%r12d,%r10d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r10d
-	movl	8(%rsi),%r12d
-	movl	%ecx,%r13d
-	movl	%r10d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%edx,%r15d
-
-	xorl	%ecx,%r13d
-	rorl	$9,%r14d
-	xorl	%r8d,%r15d
-
-	movl	%r12d,8(%rsp)
-	xorl	%r10d,%r14d
-	andl	%ecx,%r15d
-
-	rorl	$5,%r13d
-	addl	%r9d,%r12d
-	xorl	%r8d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%ecx,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r10d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r10d,%r14d
-
-	xorl	%r11d,%r15d
-	rorl	$6,%r13d
-	movl	%r11d,%r9d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r9d
-	addl	%r12d,%ebx
-	addl	%r12d,%r9d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r9d
-	movl	12(%rsi),%r12d
-	movl	%ebx,%r13d
-	movl	%r9d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%ecx,%edi
-
-	xorl	%ebx,%r13d
-	rorl	$9,%r14d
-	xorl	%edx,%edi
-
-	movl	%r12d,12(%rsp)
-	xorl	%r9d,%r14d
-	andl	%ebx,%edi
-
-	rorl	$5,%r13d
-	addl	%r8d,%r12d
-	xorl	%edx,%edi
-
-	rorl	$11,%r14d
-	xorl	%ebx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r9d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r9d,%r14d
-
-	xorl	%r10d,%edi
-	rorl	$6,%r13d
-	movl	%r10d,%r8d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r8d
-	addl	%r12d,%eax
-	addl	%r12d,%r8d
-
-	leaq	20(%rbp),%rbp
-	addl	%r14d,%r8d
-	movl	16(%rsi),%r12d
-	movl	%eax,%r13d
-	movl	%r8d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%ebx,%r15d
-
-	xorl	%eax,%r13d
-	rorl	$9,%r14d
-	xorl	%ecx,%r15d
-
-	movl	%r12d,16(%rsp)
-	xorl	%r8d,%r14d
-	andl	%eax,%r15d
-
-	rorl	$5,%r13d
-	addl	%edx,%r12d
-	xorl	%ecx,%r15d
-
-	rorl	$11,%r14d
-	xorl	%eax,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r8d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r8d,%r14d
-
-	xorl	%r9d,%r15d
-	rorl	$6,%r13d
-	movl	%r9d,%edx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%edx
-	addl	%r12d,%r11d
-	addl	%r12d,%edx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%edx
-	movl	20(%rsi),%r12d
-	movl	%r11d,%r13d
-	movl	%edx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%eax,%edi
-
-	xorl	%r11d,%r13d
-	rorl	$9,%r14d
-	xorl	%ebx,%edi
-
-	movl	%r12d,20(%rsp)
-	xorl	%edx,%r14d
-	andl	%r11d,%edi
-
-	rorl	$5,%r13d
-	addl	%ecx,%r12d
-	xorl	%ebx,%edi
-
-	rorl	$11,%r14d
-	xorl	%r11d,%r13d
-	addl	%edi,%r12d
-
-	movl	%edx,%edi
-	addl	(%rbp),%r12d
-	xorl	%edx,%r14d
-
-	xorl	%r8d,%edi
-	rorl	$6,%r13d
-	movl	%r8d,%ecx
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%ecx
-	addl	%r12d,%r10d
-	addl	%r12d,%ecx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%ecx
-	movl	24(%rsi),%r12d
-	movl	%r10d,%r13d
-	movl	%ecx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r11d,%r15d
-
-	xorl	%r10d,%r13d
-	rorl	$9,%r14d
-	xorl	%eax,%r15d
-
-	movl	%r12d,24(%rsp)
-	xorl	%ecx,%r14d
-	andl	%r10d,%r15d
-
-	rorl	$5,%r13d
-	addl	%ebx,%r12d
-	xorl	%eax,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r10d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%ecx,%r15d
-	addl	(%rbp),%r12d
-	xorl	%ecx,%r14d
-
-	xorl	%edx,%r15d
-	rorl	$6,%r13d
-	movl	%edx,%ebx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%ebx
-	addl	%r12d,%r9d
-	addl	%r12d,%ebx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%ebx
-	movl	28(%rsi),%r12d
-	movl	%r9d,%r13d
-	movl	%ebx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r10d,%edi
-
-	xorl	%r9d,%r13d
-	rorl	$9,%r14d
-	xorl	%r11d,%edi
-
-	movl	%r12d,28(%rsp)
-	xorl	%ebx,%r14d
-	andl	%r9d,%edi
-
-	rorl	$5,%r13d
-	addl	%eax,%r12d
-	xorl	%r11d,%edi
-
-	rorl	$11,%r14d
-	xorl	%r9d,%r13d
-	addl	%edi,%r12d
-
-	movl	%ebx,%edi
-	addl	(%rbp),%r12d
-	xorl	%ebx,%r14d
-
-	xorl	%ecx,%edi
-	rorl	$6,%r13d
-	movl	%ecx,%eax
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%eax
-	addl	%r12d,%r8d
-	addl	%r12d,%eax
-
-	leaq	20(%rbp),%rbp
-	addl	%r14d,%eax
-	movl	32(%rsi),%r12d
-	movl	%r8d,%r13d
-	movl	%eax,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r9d,%r15d
-
-	xorl	%r8d,%r13d
-	rorl	$9,%r14d
-	xorl	%r10d,%r15d
-
-	movl	%r12d,32(%rsp)
-	xorl	%eax,%r14d
-	andl	%r8d,%r15d
-
-	rorl	$5,%r13d
-	addl	%r11d,%r12d
-	xorl	%r10d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r8d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%eax,%r15d
-	addl	(%rbp),%r12d
-	xorl	%eax,%r14d
-
-	xorl	%ebx,%r15d
-	rorl	$6,%r13d
-	movl	%ebx,%r11d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r11d
-	addl	%r12d,%edx
-	addl	%r12d,%r11d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r11d
-	movl	36(%rsi),%r12d
-	movl	%edx,%r13d
-	movl	%r11d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r8d,%edi
-
-	xorl	%edx,%r13d
-	rorl	$9,%r14d
-	xorl	%r9d,%edi
-
-	movl	%r12d,36(%rsp)
-	xorl	%r11d,%r14d
-	andl	%edx,%edi
-
-	rorl	$5,%r13d
-	addl	%r10d,%r12d
-	xorl	%r9d,%edi
-
-	rorl	$11,%r14d
-	xorl	%edx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r11d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r11d,%r14d
-
-	xorl	%eax,%edi
-	rorl	$6,%r13d
-	movl	%eax,%r10d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r10d
-	addl	%r12d,%ecx
-	addl	%r12d,%r10d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r10d
-	movl	40(%rsi),%r12d
-	movl	%ecx,%r13d
-	movl	%r10d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%edx,%r15d
-
-	xorl	%ecx,%r13d
-	rorl	$9,%r14d
-	xorl	%r8d,%r15d
-
-	movl	%r12d,40(%rsp)
-	xorl	%r10d,%r14d
-	andl	%ecx,%r15d
-
-	rorl	$5,%r13d
-	addl	%r9d,%r12d
-	xorl	%r8d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%ecx,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r10d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r10d,%r14d
-
-	xorl	%r11d,%r15d
-	rorl	$6,%r13d
-	movl	%r11d,%r9d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r9d
-	addl	%r12d,%ebx
-	addl	%r12d,%r9d
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%r9d
-	movl	44(%rsi),%r12d
-	movl	%ebx,%r13d
-	movl	%r9d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%ecx,%edi
-
-	xorl	%ebx,%r13d
-	rorl	$9,%r14d
-	xorl	%edx,%edi
-
-	movl	%r12d,44(%rsp)
-	xorl	%r9d,%r14d
-	andl	%ebx,%edi
-
-	rorl	$5,%r13d
-	addl	%r8d,%r12d
-	xorl	%edx,%edi
-
-	rorl	$11,%r14d
-	xorl	%ebx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r9d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r9d,%r14d
-
-	xorl	%r10d,%edi
-	rorl	$6,%r13d
-	movl	%r10d,%r8d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r8d
-	addl	%r12d,%eax
-	addl	%r12d,%r8d
-
-	leaq	20(%rbp),%rbp
-	addl	%r14d,%r8d
-	movl	48(%rsi),%r12d
-	movl	%eax,%r13d
-	movl	%r8d,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%ebx,%r15d
-
-	xorl	%eax,%r13d
-	rorl	$9,%r14d
-	xorl	%ecx,%r15d
-
-	movl	%r12d,48(%rsp)
-	xorl	%r8d,%r14d
-	andl	%eax,%r15d
-
-	rorl	$5,%r13d
-	addl	%edx,%r12d
-	xorl	%ecx,%r15d
-
-	rorl	$11,%r14d
-	xorl	%eax,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r8d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r8d,%r14d
-
-	xorl	%r9d,%r15d
-	rorl	$6,%r13d
-	movl	%r9d,%edx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%edx
-	addl	%r12d,%r11d
-	addl	%r12d,%edx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%edx
-	movl	52(%rsi),%r12d
-	movl	%r11d,%r13d
-	movl	%edx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%eax,%edi
-
-	xorl	%r11d,%r13d
-	rorl	$9,%r14d
-	xorl	%ebx,%edi
-
-	movl	%r12d,52(%rsp)
-	xorl	%edx,%r14d
-	andl	%r11d,%edi
-
-	rorl	$5,%r13d
-	addl	%ecx,%r12d
-	xorl	%ebx,%edi
-
-	rorl	$11,%r14d
-	xorl	%r11d,%r13d
-	addl	%edi,%r12d
-
-	movl	%edx,%edi
-	addl	(%rbp),%r12d
-	xorl	%edx,%r14d
-
-	xorl	%r8d,%edi
-	rorl	$6,%r13d
-	movl	%r8d,%ecx
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%ecx
-	addl	%r12d,%r10d
-	addl	%r12d,%ecx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%ecx
-	movl	56(%rsi),%r12d
-	movl	%r10d,%r13d
-	movl	%ecx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r11d,%r15d
-
-	xorl	%r10d,%r13d
-	rorl	$9,%r14d
-	xorl	%eax,%r15d
-
-	movl	%r12d,56(%rsp)
-	xorl	%ecx,%r14d
-	andl	%r10d,%r15d
-
-	rorl	$5,%r13d
-	addl	%ebx,%r12d
-	xorl	%eax,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r10d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%ecx,%r15d
-	addl	(%rbp),%r12d
-	xorl	%ecx,%r14d
-
-	xorl	%edx,%r15d
-	rorl	$6,%r13d
-	movl	%edx,%ebx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%ebx
-	addl	%r12d,%r9d
-	addl	%r12d,%ebx
-
-	leaq	4(%rbp),%rbp
-	addl	%r14d,%ebx
-	movl	60(%rsi),%r12d
-	movl	%r9d,%r13d
-	movl	%ebx,%r14d
-	bswapl	%r12d
-	rorl	$14,%r13d
-	movl	%r10d,%edi
-
-	xorl	%r9d,%r13d
-	rorl	$9,%r14d
-	xorl	%r11d,%edi
-
-	movl	%r12d,60(%rsp)
-	xorl	%ebx,%r14d
-	andl	%r9d,%edi
-
-	rorl	$5,%r13d
-	addl	%eax,%r12d
-	xorl	%r11d,%edi
-
-	rorl	$11,%r14d
-	xorl	%r9d,%r13d
-	addl	%edi,%r12d
-
-	movl	%ebx,%edi
-	addl	(%rbp),%r12d
-	xorl	%ebx,%r14d
-
-	xorl	%ecx,%edi
-	rorl	$6,%r13d
-	movl	%ecx,%eax
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%eax
-	addl	%r12d,%r8d
-	addl	%r12d,%eax
-
-	leaq	20(%rbp),%rbp
-	jmp	L$rounds_16_xx
-.p2align	4
-L$rounds_16_xx:
-	movl	4(%rsp),%r13d
-	movl	56(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%eax
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	36(%rsp),%r12d
-
-	addl	0(%rsp),%r12d
-	movl	%r8d,%r13d
-	addl	%r15d,%r12d
-	movl	%eax,%r14d
-	rorl	$14,%r13d
-	movl	%r9d,%r15d
-
-	xorl	%r8d,%r13d
-	rorl	$9,%r14d
-	xorl	%r10d,%r15d
-
-	movl	%r12d,0(%rsp)
-	xorl	%eax,%r14d
-	andl	%r8d,%r15d
-
-	rorl	$5,%r13d
-	addl	%r11d,%r12d
-	xorl	%r10d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r8d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%eax,%r15d
-	addl	(%rbp),%r12d
-	xorl	%eax,%r14d
-
-	xorl	%ebx,%r15d
-	rorl	$6,%r13d
-	movl	%ebx,%r11d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r11d
-	addl	%r12d,%edx
-	addl	%r12d,%r11d
-
-	leaq	4(%rbp),%rbp
-	movl	8(%rsp),%r13d
-	movl	60(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r11d
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	40(%rsp),%r12d
-
-	addl	4(%rsp),%r12d
-	movl	%edx,%r13d
-	addl	%edi,%r12d
-	movl	%r11d,%r14d
-	rorl	$14,%r13d
-	movl	%r8d,%edi
-
-	xorl	%edx,%r13d
-	rorl	$9,%r14d
-	xorl	%r9d,%edi
-
-	movl	%r12d,4(%rsp)
-	xorl	%r11d,%r14d
-	andl	%edx,%edi
-
-	rorl	$5,%r13d
-	addl	%r10d,%r12d
-	xorl	%r9d,%edi
-
-	rorl	$11,%r14d
-	xorl	%edx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r11d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r11d,%r14d
-
-	xorl	%eax,%edi
-	rorl	$6,%r13d
-	movl	%eax,%r10d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r10d
-	addl	%r12d,%ecx
-	addl	%r12d,%r10d
-
-	leaq	4(%rbp),%rbp
-	movl	12(%rsp),%r13d
-	movl	0(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r10d
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	44(%rsp),%r12d
-
-	addl	8(%rsp),%r12d
-	movl	%ecx,%r13d
-	addl	%r15d,%r12d
-	movl	%r10d,%r14d
-	rorl	$14,%r13d
-	movl	%edx,%r15d
-
-	xorl	%ecx,%r13d
-	rorl	$9,%r14d
-	xorl	%r8d,%r15d
-
-	movl	%r12d,8(%rsp)
-	xorl	%r10d,%r14d
-	andl	%ecx,%r15d
-
-	rorl	$5,%r13d
-	addl	%r9d,%r12d
-	xorl	%r8d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%ecx,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r10d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r10d,%r14d
-
-	xorl	%r11d,%r15d
-	rorl	$6,%r13d
-	movl	%r11d,%r9d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r9d
-	addl	%r12d,%ebx
-	addl	%r12d,%r9d
-
-	leaq	4(%rbp),%rbp
-	movl	16(%rsp),%r13d
-	movl	4(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r9d
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	48(%rsp),%r12d
-
-	addl	12(%rsp),%r12d
-	movl	%ebx,%r13d
-	addl	%edi,%r12d
-	movl	%r9d,%r14d
-	rorl	$14,%r13d
-	movl	%ecx,%edi
-
-	xorl	%ebx,%r13d
-	rorl	$9,%r14d
-	xorl	%edx,%edi
-
-	movl	%r12d,12(%rsp)
-	xorl	%r9d,%r14d
-	andl	%ebx,%edi
-
-	rorl	$5,%r13d
-	addl	%r8d,%r12d
-	xorl	%edx,%edi
-
-	rorl	$11,%r14d
-	xorl	%ebx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r9d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r9d,%r14d
-
-	xorl	%r10d,%edi
-	rorl	$6,%r13d
-	movl	%r10d,%r8d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r8d
-	addl	%r12d,%eax
-	addl	%r12d,%r8d
-
-	leaq	20(%rbp),%rbp
-	movl	20(%rsp),%r13d
-	movl	8(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r8d
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	52(%rsp),%r12d
-
-	addl	16(%rsp),%r12d
-	movl	%eax,%r13d
-	addl	%r15d,%r12d
-	movl	%r8d,%r14d
-	rorl	$14,%r13d
-	movl	%ebx,%r15d
-
-	xorl	%eax,%r13d
-	rorl	$9,%r14d
-	xorl	%ecx,%r15d
-
-	movl	%r12d,16(%rsp)
-	xorl	%r8d,%r14d
-	andl	%eax,%r15d
-
-	rorl	$5,%r13d
-	addl	%edx,%r12d
-	xorl	%ecx,%r15d
-
-	rorl	$11,%r14d
-	xorl	%eax,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r8d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r8d,%r14d
-
-	xorl	%r9d,%r15d
-	rorl	$6,%r13d
-	movl	%r9d,%edx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%edx
-	addl	%r12d,%r11d
-	addl	%r12d,%edx
-
-	leaq	4(%rbp),%rbp
-	movl	24(%rsp),%r13d
-	movl	12(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%edx
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	56(%rsp),%r12d
-
-	addl	20(%rsp),%r12d
-	movl	%r11d,%r13d
-	addl	%edi,%r12d
-	movl	%edx,%r14d
-	rorl	$14,%r13d
-	movl	%eax,%edi
-
-	xorl	%r11d,%r13d
-	rorl	$9,%r14d
-	xorl	%ebx,%edi
-
-	movl	%r12d,20(%rsp)
-	xorl	%edx,%r14d
-	andl	%r11d,%edi
-
-	rorl	$5,%r13d
-	addl	%ecx,%r12d
-	xorl	%ebx,%edi
-
-	rorl	$11,%r14d
-	xorl	%r11d,%r13d
-	addl	%edi,%r12d
-
-	movl	%edx,%edi
-	addl	(%rbp),%r12d
-	xorl	%edx,%r14d
-
-	xorl	%r8d,%edi
-	rorl	$6,%r13d
-	movl	%r8d,%ecx
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%ecx
-	addl	%r12d,%r10d
-	addl	%r12d,%ecx
-
-	leaq	4(%rbp),%rbp
-	movl	28(%rsp),%r13d
-	movl	16(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%ecx
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	60(%rsp),%r12d
-
-	addl	24(%rsp),%r12d
-	movl	%r10d,%r13d
-	addl	%r15d,%r12d
-	movl	%ecx,%r14d
-	rorl	$14,%r13d
-	movl	%r11d,%r15d
-
-	xorl	%r10d,%r13d
-	rorl	$9,%r14d
-	xorl	%eax,%r15d
-
-	movl	%r12d,24(%rsp)
-	xorl	%ecx,%r14d
-	andl	%r10d,%r15d
-
-	rorl	$5,%r13d
-	addl	%ebx,%r12d
-	xorl	%eax,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r10d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%ecx,%r15d
-	addl	(%rbp),%r12d
-	xorl	%ecx,%r14d
-
-	xorl	%edx,%r15d
-	rorl	$6,%r13d
-	movl	%edx,%ebx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%ebx
-	addl	%r12d,%r9d
-	addl	%r12d,%ebx
-
-	leaq	4(%rbp),%rbp
-	movl	32(%rsp),%r13d
-	movl	20(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%ebx
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	0(%rsp),%r12d
-
-	addl	28(%rsp),%r12d
-	movl	%r9d,%r13d
-	addl	%edi,%r12d
-	movl	%ebx,%r14d
-	rorl	$14,%r13d
-	movl	%r10d,%edi
-
-	xorl	%r9d,%r13d
-	rorl	$9,%r14d
-	xorl	%r11d,%edi
-
-	movl	%r12d,28(%rsp)
-	xorl	%ebx,%r14d
-	andl	%r9d,%edi
-
-	rorl	$5,%r13d
-	addl	%eax,%r12d
-	xorl	%r11d,%edi
-
-	rorl	$11,%r14d
-	xorl	%r9d,%r13d
-	addl	%edi,%r12d
-
-	movl	%ebx,%edi
-	addl	(%rbp),%r12d
-	xorl	%ebx,%r14d
-
-	xorl	%ecx,%edi
-	rorl	$6,%r13d
-	movl	%ecx,%eax
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%eax
-	addl	%r12d,%r8d
-	addl	%r12d,%eax
-
-	leaq	20(%rbp),%rbp
-	movl	36(%rsp),%r13d
-	movl	24(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%eax
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	4(%rsp),%r12d
-
-	addl	32(%rsp),%r12d
-	movl	%r8d,%r13d
-	addl	%r15d,%r12d
-	movl	%eax,%r14d
-	rorl	$14,%r13d
-	movl	%r9d,%r15d
-
-	xorl	%r8d,%r13d
-	rorl	$9,%r14d
-	xorl	%r10d,%r15d
-
-	movl	%r12d,32(%rsp)
-	xorl	%eax,%r14d
-	andl	%r8d,%r15d
-
-	rorl	$5,%r13d
-	addl	%r11d,%r12d
-	xorl	%r10d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r8d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%eax,%r15d
-	addl	(%rbp),%r12d
-	xorl	%eax,%r14d
-
-	xorl	%ebx,%r15d
-	rorl	$6,%r13d
-	movl	%ebx,%r11d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r11d
-	addl	%r12d,%edx
-	addl	%r12d,%r11d
-
-	leaq	4(%rbp),%rbp
-	movl	40(%rsp),%r13d
-	movl	28(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r11d
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	8(%rsp),%r12d
-
-	addl	36(%rsp),%r12d
-	movl	%edx,%r13d
-	addl	%edi,%r12d
-	movl	%r11d,%r14d
-	rorl	$14,%r13d
-	movl	%r8d,%edi
-
-	xorl	%edx,%r13d
-	rorl	$9,%r14d
-	xorl	%r9d,%edi
-
-	movl	%r12d,36(%rsp)
-	xorl	%r11d,%r14d
-	andl	%edx,%edi
-
-	rorl	$5,%r13d
-	addl	%r10d,%r12d
-	xorl	%r9d,%edi
-
-	rorl	$11,%r14d
-	xorl	%edx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r11d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r11d,%r14d
-
-	xorl	%eax,%edi
-	rorl	$6,%r13d
-	movl	%eax,%r10d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r10d
-	addl	%r12d,%ecx
-	addl	%r12d,%r10d
-
-	leaq	4(%rbp),%rbp
-	movl	44(%rsp),%r13d
-	movl	32(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r10d
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	12(%rsp),%r12d
-
-	addl	40(%rsp),%r12d
-	movl	%ecx,%r13d
-	addl	%r15d,%r12d
-	movl	%r10d,%r14d
-	rorl	$14,%r13d
-	movl	%edx,%r15d
-
-	xorl	%ecx,%r13d
-	rorl	$9,%r14d
-	xorl	%r8d,%r15d
-
-	movl	%r12d,40(%rsp)
-	xorl	%r10d,%r14d
-	andl	%ecx,%r15d
-
-	rorl	$5,%r13d
-	addl	%r9d,%r12d
-	xorl	%r8d,%r15d
-
-	rorl	$11,%r14d
-	xorl	%ecx,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r10d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r10d,%r14d
-
-	xorl	%r11d,%r15d
-	rorl	$6,%r13d
-	movl	%r11d,%r9d
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%r9d
-	addl	%r12d,%ebx
-	addl	%r12d,%r9d
-
-	leaq	4(%rbp),%rbp
-	movl	48(%rsp),%r13d
-	movl	36(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r9d
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	16(%rsp),%r12d
-
-	addl	44(%rsp),%r12d
-	movl	%ebx,%r13d
-	addl	%edi,%r12d
-	movl	%r9d,%r14d
-	rorl	$14,%r13d
-	movl	%ecx,%edi
-
-	xorl	%ebx,%r13d
-	rorl	$9,%r14d
-	xorl	%edx,%edi
-
-	movl	%r12d,44(%rsp)
-	xorl	%r9d,%r14d
-	andl	%ebx,%edi
-
-	rorl	$5,%r13d
-	addl	%r8d,%r12d
-	xorl	%edx,%edi
-
-	rorl	$11,%r14d
-	xorl	%ebx,%r13d
-	addl	%edi,%r12d
-
-	movl	%r9d,%edi
-	addl	(%rbp),%r12d
-	xorl	%r9d,%r14d
-
-	xorl	%r10d,%edi
-	rorl	$6,%r13d
-	movl	%r10d,%r8d
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%r8d
-	addl	%r12d,%eax
-	addl	%r12d,%r8d
-
-	leaq	20(%rbp),%rbp
-	movl	52(%rsp),%r13d
-	movl	40(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%r8d
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	20(%rsp),%r12d
-
-	addl	48(%rsp),%r12d
-	movl	%eax,%r13d
-	addl	%r15d,%r12d
-	movl	%r8d,%r14d
-	rorl	$14,%r13d
-	movl	%ebx,%r15d
-
-	xorl	%eax,%r13d
-	rorl	$9,%r14d
-	xorl	%ecx,%r15d
-
-	movl	%r12d,48(%rsp)
-	xorl	%r8d,%r14d
-	andl	%eax,%r15d
-
-	rorl	$5,%r13d
-	addl	%edx,%r12d
-	xorl	%ecx,%r15d
-
-	rorl	$11,%r14d
-	xorl	%eax,%r13d
-	addl	%r15d,%r12d
-
-	movl	%r8d,%r15d
-	addl	(%rbp),%r12d
-	xorl	%r8d,%r14d
-
-	xorl	%r9d,%r15d
-	rorl	$6,%r13d
-	movl	%r9d,%edx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%edx
-	addl	%r12d,%r11d
-	addl	%r12d,%edx
-
-	leaq	4(%rbp),%rbp
-	movl	56(%rsp),%r13d
-	movl	44(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%edx
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	24(%rsp),%r12d
-
-	addl	52(%rsp),%r12d
-	movl	%r11d,%r13d
-	addl	%edi,%r12d
-	movl	%edx,%r14d
-	rorl	$14,%r13d
-	movl	%eax,%edi
-
-	xorl	%r11d,%r13d
-	rorl	$9,%r14d
-	xorl	%ebx,%edi
-
-	movl	%r12d,52(%rsp)
-	xorl	%edx,%r14d
-	andl	%r11d,%edi
-
-	rorl	$5,%r13d
-	addl	%ecx,%r12d
-	xorl	%ebx,%edi
-
-	rorl	$11,%r14d
-	xorl	%r11d,%r13d
-	addl	%edi,%r12d
-
-	movl	%edx,%edi
-	addl	(%rbp),%r12d
-	xorl	%edx,%r14d
-
-	xorl	%r8d,%edi
-	rorl	$6,%r13d
-	movl	%r8d,%ecx
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%ecx
-	addl	%r12d,%r10d
-	addl	%r12d,%ecx
-
-	leaq	4(%rbp),%rbp
-	movl	60(%rsp),%r13d
-	movl	48(%rsp),%r15d
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%ecx
-	movl	%r15d,%r14d
-	rorl	$2,%r15d
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%r15d
-	shrl	$10,%r14d
-
-	rorl	$17,%r15d
-	xorl	%r13d,%r12d
-	xorl	%r14d,%r15d
-	addl	28(%rsp),%r12d
-
-	addl	56(%rsp),%r12d
-	movl	%r10d,%r13d
-	addl	%r15d,%r12d
-	movl	%ecx,%r14d
-	rorl	$14,%r13d
-	movl	%r11d,%r15d
-
-	xorl	%r10d,%r13d
-	rorl	$9,%r14d
-	xorl	%eax,%r15d
-
-	movl	%r12d,56(%rsp)
-	xorl	%ecx,%r14d
-	andl	%r10d,%r15d
-
-	rorl	$5,%r13d
-	addl	%ebx,%r12d
-	xorl	%eax,%r15d
-
-	rorl	$11,%r14d
-	xorl	%r10d,%r13d
-	addl	%r15d,%r12d
-
-	movl	%ecx,%r15d
-	addl	(%rbp),%r12d
-	xorl	%ecx,%r14d
-
-	xorl	%edx,%r15d
-	rorl	$6,%r13d
-	movl	%edx,%ebx
-
-	andl	%r15d,%edi
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%edi,%ebx
-	addl	%r12d,%r9d
-	addl	%r12d,%ebx
-
-	leaq	4(%rbp),%rbp
-	movl	0(%rsp),%r13d
-	movl	52(%rsp),%edi
-
-	movl	%r13d,%r12d
-	rorl	$11,%r13d
-	addl	%r14d,%ebx
-	movl	%edi,%r14d
-	rorl	$2,%edi
-
-	xorl	%r12d,%r13d
-	shrl	$3,%r12d
-	rorl	$7,%r13d
-	xorl	%r14d,%edi
-	shrl	$10,%r14d
-
-	rorl	$17,%edi
-	xorl	%r13d,%r12d
-	xorl	%r14d,%edi
-	addl	32(%rsp),%r12d
-
-	addl	60(%rsp),%r12d
-	movl	%r9d,%r13d
-	addl	%edi,%r12d
-	movl	%ebx,%r14d
-	rorl	$14,%r13d
-	movl	%r10d,%edi
-
-	xorl	%r9d,%r13d
-	rorl	$9,%r14d
-	xorl	%r11d,%edi
-
-	movl	%r12d,60(%rsp)
-	xorl	%ebx,%r14d
-	andl	%r9d,%edi
-
-	rorl	$5,%r13d
-	addl	%eax,%r12d
-	xorl	%r11d,%edi
-
-	rorl	$11,%r14d
-	xorl	%r9d,%r13d
-	addl	%edi,%r12d
-
-	movl	%ebx,%edi
-	addl	(%rbp),%r12d
-	xorl	%ebx,%r14d
-
-	xorl	%ecx,%edi
-	rorl	$6,%r13d
-	movl	%ecx,%eax
-
-	andl	%edi,%r15d
-	rorl	$2,%r14d
-	addl	%r13d,%r12d
-
-	xorl	%r15d,%eax
-	addl	%r12d,%r8d
-	addl	%r12d,%eax
-
-	leaq	20(%rbp),%rbp
-	cmpb	$0,3(%rbp)
-	jnz	L$rounds_16_xx
-
-	movq	64+0(%rsp),%rdi
-	addl	%r14d,%eax
-	leaq	64(%rsi),%rsi
-
-	addl	0(%rdi),%eax
-	addl	4(%rdi),%ebx
-	addl	8(%rdi),%ecx
-	addl	12(%rdi),%edx
-	addl	16(%rdi),%r8d
-	addl	20(%rdi),%r9d
-	addl	24(%rdi),%r10d
-	addl	28(%rdi),%r11d
-
-	cmpq	64+16(%rsp),%rsi
-
-	movl	%eax,0(%rdi)
-	movl	%ebx,4(%rdi)
-	movl	%ecx,8(%rdi)
-	movl	%edx,12(%rdi)
-	movl	%r8d,16(%rdi)
-	movl	%r9d,20(%rdi)
-	movl	%r10d,24(%rdi)
-	movl	%r11d,28(%rdi)
-	jb	L$loop
-
-	movq	88(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue:
-	ret
-
-
-.section	__DATA,__const
-.p2align	6
-
-K256:
-.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
-.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
-.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
-.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text	
-
-.p2align	6
-sha256_block_data_order_shaext:
-
-L$shaext_shortcut:
-	leaq	K256+128(%rip),%rcx
-	movdqu	(%rdi),%xmm1
-	movdqu	16(%rdi),%xmm2
-	movdqa	512-128(%rcx),%xmm7
-
-	pshufd	$0x1b,%xmm1,%xmm0
-	pshufd	$0xb1,%xmm1,%xmm1
-	pshufd	$0x1b,%xmm2,%xmm2
-	movdqa	%xmm7,%xmm8
-.byte	102,15,58,15,202,8
-	punpcklqdq	%xmm0,%xmm2
-	jmp	L$oop_shaext
-
-.p2align	4
-L$oop_shaext:
-	movdqu	(%rsi),%xmm3
-	movdqu	16(%rsi),%xmm4
-	movdqu	32(%rsi),%xmm5
-.byte	102,15,56,0,223
-	movdqu	48(%rsi),%xmm6
-
-	movdqa	0-128(%rcx),%xmm0
-	paddd	%xmm3,%xmm0
-.byte	102,15,56,0,231
-	movdqa	%xmm2,%xmm10
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	nop
-	movdqa	%xmm1,%xmm9
-.byte	15,56,203,202
-
-	movdqa	32-128(%rcx),%xmm0
-	paddd	%xmm4,%xmm0
-.byte	102,15,56,0,239
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	leaq	64(%rsi),%rsi
-.byte	15,56,204,220
-.byte	15,56,203,202
-
-	movdqa	64-128(%rcx),%xmm0
-	paddd	%xmm5,%xmm0
-.byte	102,15,56,0,247
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
-	nop
-	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
-
-	movdqa	96-128(%rcx),%xmm0
-	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
-	nop
-	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
-	movdqa	128-128(%rcx),%xmm0
-	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
-	nop
-	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
-	movdqa	160-128(%rcx),%xmm0
-	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
-	nop
-	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
-	movdqa	192-128(%rcx),%xmm0
-	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
-	nop
-	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
-	movdqa	224-128(%rcx),%xmm0
-	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
-	nop
-	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
-	movdqa	256-128(%rcx),%xmm0
-	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
-	nop
-	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
-	movdqa	288-128(%rcx),%xmm0
-	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
-	nop
-	paddd	%xmm7,%xmm6
-.byte	15,56,204,220
-.byte	15,56,203,202
-	movdqa	320-128(%rcx),%xmm0
-	paddd	%xmm5,%xmm0
-.byte	15,56,205,245
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm6,%xmm7
-.byte	102,15,58,15,253,4
-	nop
-	paddd	%xmm7,%xmm3
-.byte	15,56,204,229
-.byte	15,56,203,202
-	movdqa	352-128(%rcx),%xmm0
-	paddd	%xmm6,%xmm0
-.byte	15,56,205,222
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm3,%xmm7
-.byte	102,15,58,15,254,4
-	nop
-	paddd	%xmm7,%xmm4
-.byte	15,56,204,238
-.byte	15,56,203,202
-	movdqa	384-128(%rcx),%xmm0
-	paddd	%xmm3,%xmm0
-.byte	15,56,205,227
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm4,%xmm7
-.byte	102,15,58,15,251,4
-	nop
-	paddd	%xmm7,%xmm5
-.byte	15,56,204,243
-.byte	15,56,203,202
-	movdqa	416-128(%rcx),%xmm0
-	paddd	%xmm4,%xmm0
-.byte	15,56,205,236
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	movdqa	%xmm5,%xmm7
-.byte	102,15,58,15,252,4
-.byte	15,56,203,202
-	paddd	%xmm7,%xmm6
-
-	movdqa	448-128(%rcx),%xmm0
-	paddd	%xmm5,%xmm0
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-.byte	15,56,205,245
-	movdqa	%xmm8,%xmm7
-.byte	15,56,203,202
-
-	movdqa	480-128(%rcx),%xmm0
-	paddd	%xmm6,%xmm0
-	nop
-.byte	15,56,203,209
-	pshufd	$0x0e,%xmm0,%xmm0
-	decq	%rdx
-	nop
-.byte	15,56,203,202
-
-	paddd	%xmm10,%xmm2
-	paddd	%xmm9,%xmm1
-	jnz	L$oop_shaext
-
-	pshufd	$0xb1,%xmm2,%xmm2
-	pshufd	$0x1b,%xmm1,%xmm7
-	pshufd	$0xb1,%xmm1,%xmm1
-	punpckhqdq	%xmm2,%xmm1
-.byte	102,15,58,15,215,8
-
-	movdqu	%xmm1,(%rdi)
-	movdqu	%xmm2,16(%rdi)
-	ret
-
-
-
-.p2align	6
-sha256_block_data_order_ssse3:
-
-L$ssse3_shortcut:
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	shlq	$4,%rdx
-	subq	$96,%rsp
-	leaq	(%rsi,%rdx,4),%rdx
-	andq	$-64,%rsp
-	movq	%rdi,64+0(%rsp)
-	movq	%rsi,64+8(%rsp)
-	movq	%rdx,64+16(%rsp)
-	movq	%rax,88(%rsp)
-
-L$prologue_ssse3:
-
-	movl	0(%rdi),%eax
-	movl	4(%rdi),%ebx
-	movl	8(%rdi),%ecx
-	movl	12(%rdi),%edx
-	movl	16(%rdi),%r8d
-	movl	20(%rdi),%r9d
-	movl	24(%rdi),%r10d
-	movl	28(%rdi),%r11d
-
-
-	jmp	L$loop_ssse3
-.p2align	4
-L$loop_ssse3:
-	movdqa	K256+512(%rip),%xmm7
-	movdqu	0(%rsi),%xmm0
-	movdqu	16(%rsi),%xmm1
-	movdqu	32(%rsi),%xmm2
-.byte	102,15,56,0,199
-	movdqu	48(%rsi),%xmm3
-	leaq	K256(%rip),%rbp
-.byte	102,15,56,0,207
-	movdqa	0(%rbp),%xmm4
-	movdqa	32(%rbp),%xmm5
-.byte	102,15,56,0,215
-	paddd	%xmm0,%xmm4
-	movdqa	64(%rbp),%xmm6
-.byte	102,15,56,0,223
-	movdqa	96(%rbp),%xmm7
-	paddd	%xmm1,%xmm5
-	paddd	%xmm2,%xmm6
-	paddd	%xmm3,%xmm7
-	movdqa	%xmm4,0(%rsp)
-	movl	%eax,%r14d
-	movdqa	%xmm5,16(%rsp)
-	movl	%ebx,%edi
-	movdqa	%xmm6,32(%rsp)
-	xorl	%ecx,%edi
-	movdqa	%xmm7,48(%rsp)
-	movl	%r8d,%r13d
-	jmp	L$ssse3_00_47
-
-.p2align	4
-L$ssse3_00_47:
-	subq	$-128,%rbp
-	rorl	$14,%r13d
-	movdqa	%xmm1,%xmm4
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	movdqa	%xmm3,%xmm7
-	rorl	$9,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	rorl	$5,%r13d
-	xorl	%eax,%r14d
-.byte	102,15,58,15,224,4
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-.byte	102,15,58,15,250,4
-	addl	0(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm4,%xmm5
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	movdqa	%xmm4,%xmm6
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	psrld	$3,%xmm4
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	paddd	%xmm7,%xmm0
-	rorl	$2,%r14d
-	addl	%r11d,%edx
-	psrld	$7,%xmm6
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	pshufd	$250,%xmm3,%xmm7
-	addl	%r11d,%r14d
-	rorl	$14,%r13d
-	pslld	$14,%xmm5
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	pxor	%xmm6,%xmm4
-	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	rorl	$5,%r13d
-	psrld	$11,%xmm6
-	xorl	%r11d,%r14d
-	pxor	%xmm5,%xmm4
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	pslld	$11,%xmm5
-	addl	4(%rsp),%r10d
-	movl	%r11d,%edi
-	pxor	%xmm6,%xmm4
-	xorl	%r9d,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm7,%xmm6
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	pxor	%xmm5,%xmm4
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	psrld	$10,%xmm7
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	paddd	%xmm4,%xmm0
-	rorl	$2,%r14d
-	addl	%r10d,%ecx
-	psrlq	$17,%xmm6
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	pxor	%xmm6,%xmm7
-	rorl	$14,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	rorl	$9,%r14d
-	psrlq	$2,%xmm6
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$5,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	pshufd	$128,%xmm7,%xmm7
-	xorl	%ecx,%r13d
-	addl	8(%rsp),%r9d
-	movl	%r10d,%r15d
-	psrldq	$8,%xmm7
-	xorl	%r8d,%r12d
-	rorl	$11,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	rorl	$6,%r13d
-	paddd	%xmm7,%xmm0
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	pshufd	$80,%xmm0,%xmm7
-	xorl	%r11d,%edi
-	rorl	$2,%r14d
-	addl	%r9d,%ebx
-	movdqa	%xmm7,%xmm6
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	psrld	$10,%xmm7
-	addl	%r9d,%r14d
-	rorl	$14,%r13d
-	psrlq	$17,%xmm6
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	rorl	$5,%r13d
-	xorl	%r9d,%r14d
-	psrlq	$2,%xmm6
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	12(%rsp),%r8d
-	pxor	%xmm6,%xmm7
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	rorl	$11,%r14d
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	movdqa	0(%rbp),%xmm6
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	pslldq	$8,%xmm7
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	paddd	%xmm7,%xmm0
-	rorl	$2,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	paddd	%xmm0,%xmm6
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	movdqa	%xmm6,0(%rsp)
-	rorl	$14,%r13d
-	movdqa	%xmm2,%xmm4
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	movdqa	%xmm0,%xmm7
-	rorl	$9,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	rorl	$5,%r13d
-	xorl	%r8d,%r14d
-.byte	102,15,58,15,225,4
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-.byte	102,15,58,15,251,4
-	addl	16(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm4,%xmm5
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	movdqa	%xmm4,%xmm6
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	psrld	$3,%xmm4
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	paddd	%xmm7,%xmm1
-	rorl	$2,%r14d
-	addl	%edx,%r11d
-	psrld	$7,%xmm6
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	pshufd	$250,%xmm0,%xmm7
-	addl	%edx,%r14d
-	rorl	$14,%r13d
-	pslld	$14,%xmm5
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	pxor	%xmm6,%xmm4
-	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	rorl	$5,%r13d
-	psrld	$11,%xmm6
-	xorl	%edx,%r14d
-	pxor	%xmm5,%xmm4
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	pslld	$11,%xmm5
-	addl	20(%rsp),%ecx
-	movl	%edx,%edi
-	pxor	%xmm6,%xmm4
-	xorl	%ebx,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm7,%xmm6
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	pxor	%xmm5,%xmm4
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	psrld	$10,%xmm7
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	paddd	%xmm4,%xmm1
-	rorl	$2,%r14d
-	addl	%ecx,%r10d
-	psrlq	$17,%xmm6
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	pxor	%xmm6,%xmm7
-	rorl	$14,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	rorl	$9,%r14d
-	psrlq	$2,%xmm6
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$5,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	pshufd	$128,%xmm7,%xmm7
-	xorl	%r10d,%r13d
-	addl	24(%rsp),%ebx
-	movl	%ecx,%r15d
-	psrldq	$8,%xmm7
-	xorl	%eax,%r12d
-	rorl	$11,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	rorl	$6,%r13d
-	paddd	%xmm7,%xmm1
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	pshufd	$80,%xmm1,%xmm7
-	xorl	%edx,%edi
-	rorl	$2,%r14d
-	addl	%ebx,%r9d
-	movdqa	%xmm7,%xmm6
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	psrld	$10,%xmm7
-	addl	%ebx,%r14d
-	rorl	$14,%r13d
-	psrlq	$17,%xmm6
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	rorl	$5,%r13d
-	xorl	%ebx,%r14d
-	psrlq	$2,%xmm6
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	28(%rsp),%eax
-	pxor	%xmm6,%xmm7
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	rorl	$11,%r14d
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	movdqa	32(%rbp),%xmm6
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	pslldq	$8,%xmm7
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	paddd	%xmm7,%xmm1
-	rorl	$2,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	paddd	%xmm1,%xmm6
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	movdqa	%xmm6,16(%rsp)
-	rorl	$14,%r13d
-	movdqa	%xmm3,%xmm4
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	movdqa	%xmm1,%xmm7
-	rorl	$9,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	rorl	$5,%r13d
-	xorl	%eax,%r14d
-.byte	102,15,58,15,226,4
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-.byte	102,15,58,15,248,4
-	addl	32(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm4,%xmm5
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	movdqa	%xmm4,%xmm6
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	psrld	$3,%xmm4
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	paddd	%xmm7,%xmm2
-	rorl	$2,%r14d
-	addl	%r11d,%edx
-	psrld	$7,%xmm6
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	pshufd	$250,%xmm1,%xmm7
-	addl	%r11d,%r14d
-	rorl	$14,%r13d
-	pslld	$14,%xmm5
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	pxor	%xmm6,%xmm4
-	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	rorl	$5,%r13d
-	psrld	$11,%xmm6
-	xorl	%r11d,%r14d
-	pxor	%xmm5,%xmm4
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	pslld	$11,%xmm5
-	addl	36(%rsp),%r10d
-	movl	%r11d,%edi
-	pxor	%xmm6,%xmm4
-	xorl	%r9d,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm7,%xmm6
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	pxor	%xmm5,%xmm4
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	psrld	$10,%xmm7
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	paddd	%xmm4,%xmm2
-	rorl	$2,%r14d
-	addl	%r10d,%ecx
-	psrlq	$17,%xmm6
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	pxor	%xmm6,%xmm7
-	rorl	$14,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	rorl	$9,%r14d
-	psrlq	$2,%xmm6
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$5,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	pshufd	$128,%xmm7,%xmm7
-	xorl	%ecx,%r13d
-	addl	40(%rsp),%r9d
-	movl	%r10d,%r15d
-	psrldq	$8,%xmm7
-	xorl	%r8d,%r12d
-	rorl	$11,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	rorl	$6,%r13d
-	paddd	%xmm7,%xmm2
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	pshufd	$80,%xmm2,%xmm7
-	xorl	%r11d,%edi
-	rorl	$2,%r14d
-	addl	%r9d,%ebx
-	movdqa	%xmm7,%xmm6
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	psrld	$10,%xmm7
-	addl	%r9d,%r14d
-	rorl	$14,%r13d
-	psrlq	$17,%xmm6
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	rorl	$5,%r13d
-	xorl	%r9d,%r14d
-	psrlq	$2,%xmm6
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	44(%rsp),%r8d
-	pxor	%xmm6,%xmm7
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	rorl	$11,%r14d
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	movdqa	64(%rbp),%xmm6
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	pslldq	$8,%xmm7
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	paddd	%xmm7,%xmm2
-	rorl	$2,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	paddd	%xmm2,%xmm6
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	movdqa	%xmm6,32(%rsp)
-	rorl	$14,%r13d
-	movdqa	%xmm0,%xmm4
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	movdqa	%xmm2,%xmm7
-	rorl	$9,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	rorl	$5,%r13d
-	xorl	%r8d,%r14d
-.byte	102,15,58,15,227,4
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-.byte	102,15,58,15,249,4
-	addl	48(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm4,%xmm5
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	movdqa	%xmm4,%xmm6
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	psrld	$3,%xmm4
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	paddd	%xmm7,%xmm3
-	rorl	$2,%r14d
-	addl	%edx,%r11d
-	psrld	$7,%xmm6
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	pshufd	$250,%xmm2,%xmm7
-	addl	%edx,%r14d
-	rorl	$14,%r13d
-	pslld	$14,%xmm5
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	pxor	%xmm6,%xmm4
-	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	rorl	$5,%r13d
-	psrld	$11,%xmm6
-	xorl	%edx,%r14d
-	pxor	%xmm5,%xmm4
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	pslld	$11,%xmm5
-	addl	52(%rsp),%ecx
-	movl	%edx,%edi
-	pxor	%xmm6,%xmm4
-	xorl	%ebx,%r12d
-	rorl	$11,%r14d
-	movdqa	%xmm7,%xmm6
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	pxor	%xmm5,%xmm4
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	psrld	$10,%xmm7
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	paddd	%xmm4,%xmm3
-	rorl	$2,%r14d
-	addl	%ecx,%r10d
-	psrlq	$17,%xmm6
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	pxor	%xmm6,%xmm7
-	rorl	$14,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	rorl	$9,%r14d
-	psrlq	$2,%xmm6
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$5,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	pshufd	$128,%xmm7,%xmm7
-	xorl	%r10d,%r13d
-	addl	56(%rsp),%ebx
-	movl	%ecx,%r15d
-	psrldq	$8,%xmm7
-	xorl	%eax,%r12d
-	rorl	$11,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	rorl	$6,%r13d
-	paddd	%xmm7,%xmm3
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	pshufd	$80,%xmm3,%xmm7
-	xorl	%edx,%edi
-	rorl	$2,%r14d
-	addl	%ebx,%r9d
-	movdqa	%xmm7,%xmm6
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	psrld	$10,%xmm7
-	addl	%ebx,%r14d
-	rorl	$14,%r13d
-	psrlq	$17,%xmm6
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	pxor	%xmm6,%xmm7
-	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	rorl	$5,%r13d
-	xorl	%ebx,%r14d
-	psrlq	$2,%xmm6
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	60(%rsp),%eax
-	pxor	%xmm6,%xmm7
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	rorl	$11,%r14d
-	pshufd	$8,%xmm7,%xmm7
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	movdqa	96(%rbp),%xmm6
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	pslldq	$8,%xmm7
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	paddd	%xmm7,%xmm3
-	rorl	$2,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	paddd	%xmm3,%xmm6
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	movdqa	%xmm6,48(%rsp)
-	cmpb	$0,131(%rbp)
-	jne	L$ssse3_00_47
-	rorl	$14,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	rorl	$9,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	rorl	$5,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-	addl	0(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	rorl	$11,%r14d
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	rorl	$2,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	rorl	$5,%r13d
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	addl	4(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	rorl	$11,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	rorl	$2,%r14d
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	rorl	$9,%r14d
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	rorl	$5,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	8(%rsp),%r9d
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	rorl	$11,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	xorl	%r11d,%edi
-	rorl	$2,%r14d
-	addl	%r9d,%ebx
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	rorl	$5,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	12(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	rorl	$11,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	rorl	$2,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	rorl	$9,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	rorl	$5,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-	addl	16(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	rorl	$11,%r14d
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	rorl	$2,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	rorl	$5,%r13d
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	addl	20(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	rorl	$11,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	rorl	$2,%r14d
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	rorl	$9,%r14d
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	rorl	$5,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	24(%rsp),%ebx
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	rorl	$11,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	xorl	%edx,%edi
-	rorl	$2,%r14d
-	addl	%ebx,%r9d
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	rorl	$5,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	28(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	rorl	$11,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	rorl	$2,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	rorl	$9,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	rorl	$5,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-	addl	32(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	rorl	$11,%r14d
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	rorl	$2,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	rorl	$9,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	rorl	$5,%r13d
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	addl	36(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	rorl	$11,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	rorl	$2,%r14d
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	rorl	$9,%r14d
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	rorl	$5,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	40(%rsp),%r9d
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	rorl	$11,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	xorl	%r11d,%edi
-	rorl	$2,%r14d
-	addl	%r9d,%ebx
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	rorl	$9,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	rorl	$5,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	44(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	rorl	$11,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	rorl	$2,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	rorl	$9,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	rorl	$5,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-	addl	48(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	rorl	$11,%r14d
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	rorl	$2,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	rorl	$9,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	rorl	$5,%r13d
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	addl	52(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	rorl	$11,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	rorl	$2,%r14d
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	rorl	$9,%r14d
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	rorl	$5,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	56(%rsp),%ebx
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	rorl	$11,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	rorl	$6,%r13d
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	xorl	%edx,%edi
-	rorl	$2,%r14d
-	addl	%ebx,%r9d
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	rorl	$14,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	rorl	$9,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	rorl	$5,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	60(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	rorl	$11,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	rorl	$6,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	rorl	$2,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	movq	64+0(%rsp),%rdi
-	movl	%r14d,%eax
-
-	addl	0(%rdi),%eax
-	leaq	64(%rsi),%rsi
-	addl	4(%rdi),%ebx
-	addl	8(%rdi),%ecx
-	addl	12(%rdi),%edx
-	addl	16(%rdi),%r8d
-	addl	20(%rdi),%r9d
-	addl	24(%rdi),%r10d
-	addl	28(%rdi),%r11d
-
-	cmpq	64+16(%rsp),%rsi
-
-	movl	%eax,0(%rdi)
-	movl	%ebx,4(%rdi)
-	movl	%ecx,8(%rdi)
-	movl	%edx,12(%rdi)
-	movl	%r8d,16(%rdi)
-	movl	%r9d,20(%rdi)
-	movl	%r10d,24(%rdi)
-	movl	%r11d,28(%rdi)
-	jb	L$loop_ssse3
-
-	movq	88(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue_ssse3:
-	ret
-
-
-
-.p2align	6
-sha256_block_data_order_avx:
-
-L$avx_shortcut:
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	shlq	$4,%rdx
-	subq	$96,%rsp
-	leaq	(%rsi,%rdx,4),%rdx
-	andq	$-64,%rsp
-	movq	%rdi,64+0(%rsp)
-	movq	%rsi,64+8(%rsp)
-	movq	%rdx,64+16(%rsp)
-	movq	%rax,88(%rsp)
-
-L$prologue_avx:
-
-	vzeroupper
-	movl	0(%rdi),%eax
-	movl	4(%rdi),%ebx
-	movl	8(%rdi),%ecx
-	movl	12(%rdi),%edx
-	movl	16(%rdi),%r8d
-	movl	20(%rdi),%r9d
-	movl	24(%rdi),%r10d
-	movl	28(%rdi),%r11d
-	vmovdqa	K256+512+32(%rip),%xmm8
-	vmovdqa	K256+512+64(%rip),%xmm9
-	jmp	L$loop_avx
-.p2align	4
-L$loop_avx:
-	vmovdqa	K256+512(%rip),%xmm7
-	vmovdqu	0(%rsi),%xmm0
-	vmovdqu	16(%rsi),%xmm1
-	vmovdqu	32(%rsi),%xmm2
-	vmovdqu	48(%rsi),%xmm3
-	vpshufb	%xmm7,%xmm0,%xmm0
-	leaq	K256(%rip),%rbp
-	vpshufb	%xmm7,%xmm1,%xmm1
-	vpshufb	%xmm7,%xmm2,%xmm2
-	vpaddd	0(%rbp),%xmm0,%xmm4
-	vpshufb	%xmm7,%xmm3,%xmm3
-	vpaddd	32(%rbp),%xmm1,%xmm5
-	vpaddd	64(%rbp),%xmm2,%xmm6
-	vpaddd	96(%rbp),%xmm3,%xmm7
-	vmovdqa	%xmm4,0(%rsp)
-	movl	%eax,%r14d
-	vmovdqa	%xmm5,16(%rsp)
-	movl	%ebx,%edi
-	vmovdqa	%xmm6,32(%rsp)
-	xorl	%ecx,%edi
-	vmovdqa	%xmm7,48(%rsp)
-	movl	%r8d,%r13d
-	jmp	L$avx_00_47
-
-.p2align	4
-L$avx_00_47:
-	subq	$-128,%rbp
-	vpalignr	$4,%xmm0,%xmm1,%xmm4
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	vpalignr	$4,%xmm2,%xmm3,%xmm7
-	shrdl	$9,%r14d,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%r13d,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	vpaddd	%xmm7,%xmm0,%xmm0
-	xorl	%r8d,%r13d
-	addl	0(%rsp),%r11d
-	movl	%eax,%r15d
-	vpsrld	$3,%xmm4,%xmm7
-	xorl	%r10d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ebx,%r15d
-	vpslld	$14,%xmm4,%xmm5
-	addl	%r12d,%r11d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	vpshufd	$250,%xmm3,%xmm7
-	shrdl	$2,%r14d,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	vpsrld	$11,%xmm6,%xmm6
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	shrdl	$14,%r13d,%r13d
-	vpxor	%xmm5,%xmm4,%xmm4
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	shrdl	$9,%r14d,%r14d
-	vpslld	$11,%xmm5,%xmm5
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	shrdl	$5,%r13d,%r13d
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	vpsrld	$10,%xmm7,%xmm6
-	addl	4(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	vpxor	%xmm5,%xmm4,%xmm4
-	shrdl	$11,%r14d,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	vpsrlq	$17,%xmm7,%xmm7
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	vpaddd	%xmm4,%xmm0,%xmm0
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	shrdl	$2,%r14d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%r10d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r10d
-	vpxor	%xmm7,%xmm6,%xmm6
-	movl	%edx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ecx,%r13d
-	vpshufb	%xmm8,%xmm6,%xmm6
-	xorl	%r8d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r10d,%r14d
-	vpaddd	%xmm6,%xmm0,%xmm0
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	8(%rsp),%r9d
-	vpshufd	$80,%xmm0,%xmm7
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	shrdl	$11,%r14d,%r14d
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	shrdl	$6,%r13d,%r13d
-	vpsrlq	$17,%xmm7,%xmm7
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	vpxor	%xmm7,%xmm6,%xmm6
-	xorl	%r11d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r9d,%ebx
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	vpshufb	%xmm9,%xmm6,%xmm6
-	shrdl	$9,%r14d,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	vpaddd	%xmm6,%xmm0,%xmm0
-	shrdl	$5,%r13d,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	vpaddd	0(%rbp),%xmm0,%xmm6
-	xorl	%ebx,%r13d
-	addl	12(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	vmovdqa	%xmm6,0(%rsp)
-	vpalignr	$4,%xmm1,%xmm2,%xmm4
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	vpalignr	$4,%xmm3,%xmm0,%xmm7
-	shrdl	$9,%r14d,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%r13d,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	vpaddd	%xmm7,%xmm1,%xmm1
-	xorl	%eax,%r13d
-	addl	16(%rsp),%edx
-	movl	%r8d,%r15d
-	vpsrld	$3,%xmm4,%xmm7
-	xorl	%ecx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r9d,%r15d
-	vpslld	$14,%xmm4,%xmm5
-	addl	%r12d,%edx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	vpshufd	$250,%xmm0,%xmm7
-	shrdl	$2,%r14d,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	vpsrld	$11,%xmm6,%xmm6
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	shrdl	$14,%r13d,%r13d
-	vpxor	%xmm5,%xmm4,%xmm4
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	shrdl	$9,%r14d,%r14d
-	vpslld	$11,%xmm5,%xmm5
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	shrdl	$5,%r13d,%r13d
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	vpsrld	$10,%xmm7,%xmm6
-	addl	20(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	vpxor	%xmm5,%xmm4,%xmm4
-	shrdl	$11,%r14d,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	vpsrlq	$17,%xmm7,%xmm7
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	vpaddd	%xmm4,%xmm1,%xmm1
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	shrdl	$2,%r14d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%ecx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	movl	%r11d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r10d,%r13d
-	vpshufb	%xmm8,%xmm6,%xmm6
-	xorl	%eax,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ecx,%r14d
-	vpaddd	%xmm6,%xmm1,%xmm1
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	24(%rsp),%ebx
-	vpshufd	$80,%xmm1,%xmm7
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	shrdl	$11,%r14d,%r14d
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	shrdl	$6,%r13d,%r13d
-	vpsrlq	$17,%xmm7,%xmm7
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	vpxor	%xmm7,%xmm6,%xmm6
-	xorl	%edx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%ebx,%r9d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	vpshufb	%xmm9,%xmm6,%xmm6
-	shrdl	$9,%r14d,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	vpaddd	%xmm6,%xmm1,%xmm1
-	shrdl	$5,%r13d,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	vpaddd	32(%rbp),%xmm1,%xmm6
-	xorl	%r9d,%r13d
-	addl	28(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	vmovdqa	%xmm6,16(%rsp)
-	vpalignr	$4,%xmm2,%xmm3,%xmm4
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	vpalignr	$4,%xmm0,%xmm1,%xmm7
-	shrdl	$9,%r14d,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%r13d,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	vpaddd	%xmm7,%xmm2,%xmm2
-	xorl	%r8d,%r13d
-	addl	32(%rsp),%r11d
-	movl	%eax,%r15d
-	vpsrld	$3,%xmm4,%xmm7
-	xorl	%r10d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ebx,%r15d
-	vpslld	$14,%xmm4,%xmm5
-	addl	%r12d,%r11d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	vpshufd	$250,%xmm1,%xmm7
-	shrdl	$2,%r14d,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	vpsrld	$11,%xmm6,%xmm6
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	shrdl	$14,%r13d,%r13d
-	vpxor	%xmm5,%xmm4,%xmm4
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	shrdl	$9,%r14d,%r14d
-	vpslld	$11,%xmm5,%xmm5
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	shrdl	$5,%r13d,%r13d
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	vpsrld	$10,%xmm7,%xmm6
-	addl	36(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	vpxor	%xmm5,%xmm4,%xmm4
-	shrdl	$11,%r14d,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	vpsrlq	$17,%xmm7,%xmm7
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	vpaddd	%xmm4,%xmm2,%xmm2
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	shrdl	$2,%r14d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%r10d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r10d
-	vpxor	%xmm7,%xmm6,%xmm6
-	movl	%edx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ecx,%r13d
-	vpshufb	%xmm8,%xmm6,%xmm6
-	xorl	%r8d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r10d,%r14d
-	vpaddd	%xmm6,%xmm2,%xmm2
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	40(%rsp),%r9d
-	vpshufd	$80,%xmm2,%xmm7
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	shrdl	$11,%r14d,%r14d
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	shrdl	$6,%r13d,%r13d
-	vpsrlq	$17,%xmm7,%xmm7
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	vpxor	%xmm7,%xmm6,%xmm6
-	xorl	%r11d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r9d,%ebx
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	vpshufb	%xmm9,%xmm6,%xmm6
-	shrdl	$9,%r14d,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	vpaddd	%xmm6,%xmm2,%xmm2
-	shrdl	$5,%r13d,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	vpaddd	64(%rbp),%xmm2,%xmm6
-	xorl	%ebx,%r13d
-	addl	44(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	vmovdqa	%xmm6,32(%rsp)
-	vpalignr	$4,%xmm3,%xmm0,%xmm4
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	vpalignr	$4,%xmm1,%xmm2,%xmm7
-	shrdl	$9,%r14d,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	vpsrld	$7,%xmm4,%xmm6
-	shrdl	$5,%r13d,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	vpaddd	%xmm7,%xmm3,%xmm3
-	xorl	%eax,%r13d
-	addl	48(%rsp),%edx
-	movl	%r8d,%r15d
-	vpsrld	$3,%xmm4,%xmm7
-	xorl	%ecx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r9d,%r15d
-	vpslld	$14,%xmm4,%xmm5
-	addl	%r12d,%edx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	vpxor	%xmm6,%xmm7,%xmm4
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	vpshufd	$250,%xmm2,%xmm7
-	shrdl	$2,%r14d,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	vpsrld	$11,%xmm6,%xmm6
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	shrdl	$14,%r13d,%r13d
-	vpxor	%xmm5,%xmm4,%xmm4
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	shrdl	$9,%r14d,%r14d
-	vpslld	$11,%xmm5,%xmm5
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	shrdl	$5,%r13d,%r13d
-	vpxor	%xmm6,%xmm4,%xmm4
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	vpsrld	$10,%xmm7,%xmm6
-	addl	52(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	vpxor	%xmm5,%xmm4,%xmm4
-	shrdl	$11,%r14d,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	vpsrlq	$17,%xmm7,%xmm7
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	vpaddd	%xmm4,%xmm3,%xmm3
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	shrdl	$2,%r14d,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%ecx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ecx
-	vpxor	%xmm7,%xmm6,%xmm6
-	movl	%r11d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r10d,%r13d
-	vpshufb	%xmm8,%xmm6,%xmm6
-	xorl	%eax,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ecx,%r14d
-	vpaddd	%xmm6,%xmm3,%xmm3
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	56(%rsp),%ebx
-	vpshufd	$80,%xmm3,%xmm7
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	shrdl	$11,%r14d,%r14d
-	vpsrld	$10,%xmm7,%xmm6
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	shrdl	$6,%r13d,%r13d
-	vpsrlq	$17,%xmm7,%xmm7
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	vpxor	%xmm7,%xmm6,%xmm6
-	xorl	%edx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%ebx,%r9d
-	vpsrlq	$2,%xmm7,%xmm7
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	vpxor	%xmm7,%xmm6,%xmm6
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	vpshufb	%xmm9,%xmm6,%xmm6
-	shrdl	$9,%r14d,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	vpaddd	%xmm6,%xmm3,%xmm3
-	shrdl	$5,%r13d,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	vpaddd	96(%rbp),%xmm3,%xmm6
-	xorl	%r9d,%r13d
-	addl	60(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	vmovdqa	%xmm6,48(%rsp)
-	cmpb	$0,131(%rbp)
-	jne	L$avx_00_47
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-	addl	0(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	addl	4(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	8(%rsp),%r9d
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	xorl	%r11d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r9d,%ebx
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	12(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-	addl	16(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	addl	20(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	24(%rsp),%ebx
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	xorl	%edx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%ebx,%r9d
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	28(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%eax
-	movl	%r9d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r8d,%r13d
-	xorl	%r10d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%eax,%r14d
-	andl	%r8d,%r12d
-	xorl	%r8d,%r13d
-	addl	32(%rsp),%r11d
-	movl	%eax,%r15d
-	xorl	%r10d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ebx,%r15d
-	addl	%r12d,%r11d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%eax,%r14d
-	addl	%r13d,%r11d
-	xorl	%ebx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r11d,%edx
-	addl	%edi,%r11d
-	movl	%edx,%r13d
-	addl	%r11d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r11d
-	movl	%r8d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%edx,%r13d
-	xorl	%r9d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r11d,%r14d
-	andl	%edx,%r12d
-	xorl	%edx,%r13d
-	addl	36(%rsp),%r10d
-	movl	%r11d,%edi
-	xorl	%r9d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%eax,%edi
-	addl	%r12d,%r10d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r11d,%r14d
-	addl	%r13d,%r10d
-	xorl	%eax,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r10d,%ecx
-	addl	%r15d,%r10d
-	movl	%ecx,%r13d
-	addl	%r10d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r10d
-	movl	%edx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ecx,%r13d
-	xorl	%r8d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r10d,%r14d
-	andl	%ecx,%r12d
-	xorl	%ecx,%r13d
-	addl	40(%rsp),%r9d
-	movl	%r10d,%r15d
-	xorl	%r8d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r11d,%r15d
-	addl	%r12d,%r9d
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%r10d,%r14d
-	addl	%r13d,%r9d
-	xorl	%r11d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%r9d,%ebx
-	addl	%edi,%r9d
-	movl	%ebx,%r13d
-	addl	%r9d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r9d
-	movl	%ecx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%ebx,%r13d
-	xorl	%edx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r9d,%r14d
-	andl	%ebx,%r12d
-	xorl	%ebx,%r13d
-	addl	44(%rsp),%r8d
-	movl	%r9d,%edi
-	xorl	%edx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r10d,%edi
-	addl	%r12d,%r8d
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%r9d,%r14d
-	addl	%r13d,%r8d
-	xorl	%r10d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%r8d,%eax
-	addl	%r15d,%r8d
-	movl	%eax,%r13d
-	addl	%r8d,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%r8d
-	movl	%ebx,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%eax,%r13d
-	xorl	%ecx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%r8d,%r14d
-	andl	%eax,%r12d
-	xorl	%eax,%r13d
-	addl	48(%rsp),%edx
-	movl	%r8d,%r15d
-	xorl	%ecx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r9d,%r15d
-	addl	%r12d,%edx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%r8d,%r14d
-	addl	%r13d,%edx
-	xorl	%r9d,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%edx,%r11d
-	addl	%edi,%edx
-	movl	%r11d,%r13d
-	addl	%edx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%edx
-	movl	%eax,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r11d,%r13d
-	xorl	%ebx,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%edx,%r14d
-	andl	%r11d,%r12d
-	xorl	%r11d,%r13d
-	addl	52(%rsp),%ecx
-	movl	%edx,%edi
-	xorl	%ebx,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%r8d,%edi
-	addl	%r12d,%ecx
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%edx,%r14d
-	addl	%r13d,%ecx
-	xorl	%r8d,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%ecx,%r10d
-	addl	%r15d,%ecx
-	movl	%r10d,%r13d
-	addl	%ecx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ecx
-	movl	%r11d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r10d,%r13d
-	xorl	%eax,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ecx,%r14d
-	andl	%r10d,%r12d
-	xorl	%r10d,%r13d
-	addl	56(%rsp),%ebx
-	movl	%ecx,%r15d
-	xorl	%eax,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%edx,%r15d
-	addl	%r12d,%ebx
-	shrdl	$6,%r13d,%r13d
-	andl	%r15d,%edi
-	xorl	%ecx,%r14d
-	addl	%r13d,%ebx
-	xorl	%edx,%edi
-	shrdl	$2,%r14d,%r14d
-	addl	%ebx,%r9d
-	addl	%edi,%ebx
-	movl	%r9d,%r13d
-	addl	%ebx,%r14d
-	shrdl	$14,%r13d,%r13d
-	movl	%r14d,%ebx
-	movl	%r10d,%r12d
-	shrdl	$9,%r14d,%r14d
-	xorl	%r9d,%r13d
-	xorl	%r11d,%r12d
-	shrdl	$5,%r13d,%r13d
-	xorl	%ebx,%r14d
-	andl	%r9d,%r12d
-	xorl	%r9d,%r13d
-	addl	60(%rsp),%eax
-	movl	%ebx,%edi
-	xorl	%r11d,%r12d
-	shrdl	$11,%r14d,%r14d
-	xorl	%ecx,%edi
-	addl	%r12d,%eax
-	shrdl	$6,%r13d,%r13d
-	andl	%edi,%r15d
-	xorl	%ebx,%r14d
-	addl	%r13d,%eax
-	xorl	%ecx,%r15d
-	shrdl	$2,%r14d,%r14d
-	addl	%eax,%r8d
-	addl	%r15d,%eax
-	movl	%r8d,%r13d
-	addl	%eax,%r14d
-	movq	64+0(%rsp),%rdi
-	movl	%r14d,%eax
-
-	addl	0(%rdi),%eax
-	leaq	64(%rsi),%rsi
-	addl	4(%rdi),%ebx
-	addl	8(%rdi),%ecx
-	addl	12(%rdi),%edx
-	addl	16(%rdi),%r8d
-	addl	20(%rdi),%r9d
-	addl	24(%rdi),%r10d
-	addl	28(%rdi),%r11d
-
-	cmpq	64+16(%rsp),%rsi
-
-	movl	%eax,0(%rdi)
-	movl	%ebx,4(%rdi)
-	movl	%ecx,8(%rdi)
-	movl	%edx,12(%rdi)
-	movl	%r8d,16(%rdi)
-	movl	%r9d,20(%rdi)
-	movl	%r10d,24(%rdi)
-	movl	%r11d,28(%rdi)
-	jb	L$loop_avx
-
-	movq	88(%rsp),%rsi
-
-	vzeroupper
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue_avx:
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S
deleted file mode 100644
index 6e2e13e..0000000
--- a/apple-x86_64/crypto/fipsmodule/sha512-x86_64-apple.S
+++ /dev/null
@@ -1,2986 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-.globl	_sha512_block_data_order
-.private_extern _sha512_block_data_order
-
-.p2align	4
-_sha512_block_data_order:
-
-_CET_ENDBR
-	leaq	_OPENSSL_ia32cap_P(%rip),%r11
-	movl	0(%r11),%r9d
-	movl	4(%r11),%r10d
-	movl	8(%r11),%r11d
-	andl	$1073741824,%r9d
-	andl	$268435968,%r10d
-	orl	%r9d,%r10d
-	cmpl	$1342177792,%r10d
-	je	L$avx_shortcut
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	shlq	$4,%rdx
-	subq	$128+32,%rsp
-	leaq	(%rsi,%rdx,8),%rdx
-	andq	$-64,%rsp
-	movq	%rdi,128+0(%rsp)
-	movq	%rsi,128+8(%rsp)
-	movq	%rdx,128+16(%rsp)
-	movq	%rax,152(%rsp)
-
-L$prologue:
-
-	movq	0(%rdi),%rax
-	movq	8(%rdi),%rbx
-	movq	16(%rdi),%rcx
-	movq	24(%rdi),%rdx
-	movq	32(%rdi),%r8
-	movq	40(%rdi),%r9
-	movq	48(%rdi),%r10
-	movq	56(%rdi),%r11
-	jmp	L$loop
-
-.p2align	4
-L$loop:
-	movq	%rbx,%rdi
-	leaq	K512(%rip),%rbp
-	xorq	%rcx,%rdi
-	movq	0(%rsi),%r12
-	movq	%r8,%r13
-	movq	%rax,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r9,%r15
-
-	xorq	%r8,%r13
-	rorq	$5,%r14
-	xorq	%r10,%r15
-
-	movq	%r12,0(%rsp)
-	xorq	%rax,%r14
-	andq	%r8,%r15
-
-	rorq	$4,%r13
-	addq	%r11,%r12
-	xorq	%r10,%r15
-
-	rorq	$6,%r14
-	xorq	%r8,%r13
-	addq	%r15,%r12
-
-	movq	%rax,%r15
-	addq	(%rbp),%r12
-	xorq	%rax,%r14
-
-	xorq	%rbx,%r15
-	rorq	$14,%r13
-	movq	%rbx,%r11
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r11
-	addq	%r12,%rdx
-	addq	%r12,%r11
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%r11
-	movq	8(%rsi),%r12
-	movq	%rdx,%r13
-	movq	%r11,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r8,%rdi
-
-	xorq	%rdx,%r13
-	rorq	$5,%r14
-	xorq	%r9,%rdi
-
-	movq	%r12,8(%rsp)
-	xorq	%r11,%r14
-	andq	%rdx,%rdi
-
-	rorq	$4,%r13
-	addq	%r10,%r12
-	xorq	%r9,%rdi
-
-	rorq	$6,%r14
-	xorq	%rdx,%r13
-	addq	%rdi,%r12
-
-	movq	%r11,%rdi
-	addq	(%rbp),%r12
-	xorq	%r11,%r14
-
-	xorq	%rax,%rdi
-	rorq	$14,%r13
-	movq	%rax,%r10
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r10
-	addq	%r12,%rcx
-	addq	%r12,%r10
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%r10
-	movq	16(%rsi),%r12
-	movq	%rcx,%r13
-	movq	%r10,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rdx,%r15
-
-	xorq	%rcx,%r13
-	rorq	$5,%r14
-	xorq	%r8,%r15
-
-	movq	%r12,16(%rsp)
-	xorq	%r10,%r14
-	andq	%rcx,%r15
-
-	rorq	$4,%r13
-	addq	%r9,%r12
-	xorq	%r8,%r15
-
-	rorq	$6,%r14
-	xorq	%rcx,%r13
-	addq	%r15,%r12
-
-	movq	%r10,%r15
-	addq	(%rbp),%r12
-	xorq	%r10,%r14
-
-	xorq	%r11,%r15
-	rorq	$14,%r13
-	movq	%r11,%r9
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r9
-	addq	%r12,%rbx
-	addq	%r12,%r9
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%r9
-	movq	24(%rsi),%r12
-	movq	%rbx,%r13
-	movq	%r9,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rcx,%rdi
-
-	xorq	%rbx,%r13
-	rorq	$5,%r14
-	xorq	%rdx,%rdi
-
-	movq	%r12,24(%rsp)
-	xorq	%r9,%r14
-	andq	%rbx,%rdi
-
-	rorq	$4,%r13
-	addq	%r8,%r12
-	xorq	%rdx,%rdi
-
-	rorq	$6,%r14
-	xorq	%rbx,%r13
-	addq	%rdi,%r12
-
-	movq	%r9,%rdi
-	addq	(%rbp),%r12
-	xorq	%r9,%r14
-
-	xorq	%r10,%rdi
-	rorq	$14,%r13
-	movq	%r10,%r8
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r8
-	addq	%r12,%rax
-	addq	%r12,%r8
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%r8
-	movq	32(%rsi),%r12
-	movq	%rax,%r13
-	movq	%r8,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rbx,%r15
-
-	xorq	%rax,%r13
-	rorq	$5,%r14
-	xorq	%rcx,%r15
-
-	movq	%r12,32(%rsp)
-	xorq	%r8,%r14
-	andq	%rax,%r15
-
-	rorq	$4,%r13
-	addq	%rdx,%r12
-	xorq	%rcx,%r15
-
-	rorq	$6,%r14
-	xorq	%rax,%r13
-	addq	%r15,%r12
-
-	movq	%r8,%r15
-	addq	(%rbp),%r12
-	xorq	%r8,%r14
-
-	xorq	%r9,%r15
-	rorq	$14,%r13
-	movq	%r9,%rdx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rdx
-	addq	%r12,%r11
-	addq	%r12,%rdx
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%rdx
-	movq	40(%rsi),%r12
-	movq	%r11,%r13
-	movq	%rdx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rax,%rdi
-
-	xorq	%r11,%r13
-	rorq	$5,%r14
-	xorq	%rbx,%rdi
-
-	movq	%r12,40(%rsp)
-	xorq	%rdx,%r14
-	andq	%r11,%rdi
-
-	rorq	$4,%r13
-	addq	%rcx,%r12
-	xorq	%rbx,%rdi
-
-	rorq	$6,%r14
-	xorq	%r11,%r13
-	addq	%rdi,%r12
-
-	movq	%rdx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rdx,%r14
-
-	xorq	%r8,%rdi
-	rorq	$14,%r13
-	movq	%r8,%rcx
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rcx
-	addq	%r12,%r10
-	addq	%r12,%rcx
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%rcx
-	movq	48(%rsi),%r12
-	movq	%r10,%r13
-	movq	%rcx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r11,%r15
-
-	xorq	%r10,%r13
-	rorq	$5,%r14
-	xorq	%rax,%r15
-
-	movq	%r12,48(%rsp)
-	xorq	%rcx,%r14
-	andq	%r10,%r15
-
-	rorq	$4,%r13
-	addq	%rbx,%r12
-	xorq	%rax,%r15
-
-	rorq	$6,%r14
-	xorq	%r10,%r13
-	addq	%r15,%r12
-
-	movq	%rcx,%r15
-	addq	(%rbp),%r12
-	xorq	%rcx,%r14
-
-	xorq	%rdx,%r15
-	rorq	$14,%r13
-	movq	%rdx,%rbx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rbx
-	addq	%r12,%r9
-	addq	%r12,%rbx
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%rbx
-	movq	56(%rsi),%r12
-	movq	%r9,%r13
-	movq	%rbx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r10,%rdi
-
-	xorq	%r9,%r13
-	rorq	$5,%r14
-	xorq	%r11,%rdi
-
-	movq	%r12,56(%rsp)
-	xorq	%rbx,%r14
-	andq	%r9,%rdi
-
-	rorq	$4,%r13
-	addq	%rax,%r12
-	xorq	%r11,%rdi
-
-	rorq	$6,%r14
-	xorq	%r9,%r13
-	addq	%rdi,%r12
-
-	movq	%rbx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rbx,%r14
-
-	xorq	%rcx,%rdi
-	rorq	$14,%r13
-	movq	%rcx,%rax
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rax
-	addq	%r12,%r8
-	addq	%r12,%rax
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%rax
-	movq	64(%rsi),%r12
-	movq	%r8,%r13
-	movq	%rax,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r9,%r15
-
-	xorq	%r8,%r13
-	rorq	$5,%r14
-	xorq	%r10,%r15
-
-	movq	%r12,64(%rsp)
-	xorq	%rax,%r14
-	andq	%r8,%r15
-
-	rorq	$4,%r13
-	addq	%r11,%r12
-	xorq	%r10,%r15
-
-	rorq	$6,%r14
-	xorq	%r8,%r13
-	addq	%r15,%r12
-
-	movq	%rax,%r15
-	addq	(%rbp),%r12
-	xorq	%rax,%r14
-
-	xorq	%rbx,%r15
-	rorq	$14,%r13
-	movq	%rbx,%r11
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r11
-	addq	%r12,%rdx
-	addq	%r12,%r11
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%r11
-	movq	72(%rsi),%r12
-	movq	%rdx,%r13
-	movq	%r11,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r8,%rdi
-
-	xorq	%rdx,%r13
-	rorq	$5,%r14
-	xorq	%r9,%rdi
-
-	movq	%r12,72(%rsp)
-	xorq	%r11,%r14
-	andq	%rdx,%rdi
-
-	rorq	$4,%r13
-	addq	%r10,%r12
-	xorq	%r9,%rdi
-
-	rorq	$6,%r14
-	xorq	%rdx,%r13
-	addq	%rdi,%r12
-
-	movq	%r11,%rdi
-	addq	(%rbp),%r12
-	xorq	%r11,%r14
-
-	xorq	%rax,%rdi
-	rorq	$14,%r13
-	movq	%rax,%r10
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r10
-	addq	%r12,%rcx
-	addq	%r12,%r10
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%r10
-	movq	80(%rsi),%r12
-	movq	%rcx,%r13
-	movq	%r10,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rdx,%r15
-
-	xorq	%rcx,%r13
-	rorq	$5,%r14
-	xorq	%r8,%r15
-
-	movq	%r12,80(%rsp)
-	xorq	%r10,%r14
-	andq	%rcx,%r15
-
-	rorq	$4,%r13
-	addq	%r9,%r12
-	xorq	%r8,%r15
-
-	rorq	$6,%r14
-	xorq	%rcx,%r13
-	addq	%r15,%r12
-
-	movq	%r10,%r15
-	addq	(%rbp),%r12
-	xorq	%r10,%r14
-
-	xorq	%r11,%r15
-	rorq	$14,%r13
-	movq	%r11,%r9
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r9
-	addq	%r12,%rbx
-	addq	%r12,%r9
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%r9
-	movq	88(%rsi),%r12
-	movq	%rbx,%r13
-	movq	%r9,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rcx,%rdi
-
-	xorq	%rbx,%r13
-	rorq	$5,%r14
-	xorq	%rdx,%rdi
-
-	movq	%r12,88(%rsp)
-	xorq	%r9,%r14
-	andq	%rbx,%rdi
-
-	rorq	$4,%r13
-	addq	%r8,%r12
-	xorq	%rdx,%rdi
-
-	rorq	$6,%r14
-	xorq	%rbx,%r13
-	addq	%rdi,%r12
-
-	movq	%r9,%rdi
-	addq	(%rbp),%r12
-	xorq	%r9,%r14
-
-	xorq	%r10,%rdi
-	rorq	$14,%r13
-	movq	%r10,%r8
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r8
-	addq	%r12,%rax
-	addq	%r12,%r8
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%r8
-	movq	96(%rsi),%r12
-	movq	%rax,%r13
-	movq	%r8,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rbx,%r15
-
-	xorq	%rax,%r13
-	rorq	$5,%r14
-	xorq	%rcx,%r15
-
-	movq	%r12,96(%rsp)
-	xorq	%r8,%r14
-	andq	%rax,%r15
-
-	rorq	$4,%r13
-	addq	%rdx,%r12
-	xorq	%rcx,%r15
-
-	rorq	$6,%r14
-	xorq	%rax,%r13
-	addq	%r15,%r12
-
-	movq	%r8,%r15
-	addq	(%rbp),%r12
-	xorq	%r8,%r14
-
-	xorq	%r9,%r15
-	rorq	$14,%r13
-	movq	%r9,%rdx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rdx
-	addq	%r12,%r11
-	addq	%r12,%rdx
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%rdx
-	movq	104(%rsi),%r12
-	movq	%r11,%r13
-	movq	%rdx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%rax,%rdi
-
-	xorq	%r11,%r13
-	rorq	$5,%r14
-	xorq	%rbx,%rdi
-
-	movq	%r12,104(%rsp)
-	xorq	%rdx,%r14
-	andq	%r11,%rdi
-
-	rorq	$4,%r13
-	addq	%rcx,%r12
-	xorq	%rbx,%rdi
-
-	rorq	$6,%r14
-	xorq	%r11,%r13
-	addq	%rdi,%r12
-
-	movq	%rdx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rdx,%r14
-
-	xorq	%r8,%rdi
-	rorq	$14,%r13
-	movq	%r8,%rcx
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rcx
-	addq	%r12,%r10
-	addq	%r12,%rcx
-
-	leaq	24(%rbp),%rbp
-	addq	%r14,%rcx
-	movq	112(%rsi),%r12
-	movq	%r10,%r13
-	movq	%rcx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r11,%r15
-
-	xorq	%r10,%r13
-	rorq	$5,%r14
-	xorq	%rax,%r15
-
-	movq	%r12,112(%rsp)
-	xorq	%rcx,%r14
-	andq	%r10,%r15
-
-	rorq	$4,%r13
-	addq	%rbx,%r12
-	xorq	%rax,%r15
-
-	rorq	$6,%r14
-	xorq	%r10,%r13
-	addq	%r15,%r12
-
-	movq	%rcx,%r15
-	addq	(%rbp),%r12
-	xorq	%rcx,%r14
-
-	xorq	%rdx,%r15
-	rorq	$14,%r13
-	movq	%rdx,%rbx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rbx
-	addq	%r12,%r9
-	addq	%r12,%rbx
-
-	leaq	8(%rbp),%rbp
-	addq	%r14,%rbx
-	movq	120(%rsi),%r12
-	movq	%r9,%r13
-	movq	%rbx,%r14
-	bswapq	%r12
-	rorq	$23,%r13
-	movq	%r10,%rdi
-
-	xorq	%r9,%r13
-	rorq	$5,%r14
-	xorq	%r11,%rdi
-
-	movq	%r12,120(%rsp)
-	xorq	%rbx,%r14
-	andq	%r9,%rdi
-
-	rorq	$4,%r13
-	addq	%rax,%r12
-	xorq	%r11,%rdi
-
-	rorq	$6,%r14
-	xorq	%r9,%r13
-	addq	%rdi,%r12
-
-	movq	%rbx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rbx,%r14
-
-	xorq	%rcx,%rdi
-	rorq	$14,%r13
-	movq	%rcx,%rax
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rax
-	addq	%r12,%r8
-	addq	%r12,%rax
-
-	leaq	24(%rbp),%rbp
-	jmp	L$rounds_16_xx
-.p2align	4
-L$rounds_16_xx:
-	movq	8(%rsp),%r13
-	movq	112(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rax
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	72(%rsp),%r12
-
-	addq	0(%rsp),%r12
-	movq	%r8,%r13
-	addq	%r15,%r12
-	movq	%rax,%r14
-	rorq	$23,%r13
-	movq	%r9,%r15
-
-	xorq	%r8,%r13
-	rorq	$5,%r14
-	xorq	%r10,%r15
-
-	movq	%r12,0(%rsp)
-	xorq	%rax,%r14
-	andq	%r8,%r15
-
-	rorq	$4,%r13
-	addq	%r11,%r12
-	xorq	%r10,%r15
-
-	rorq	$6,%r14
-	xorq	%r8,%r13
-	addq	%r15,%r12
-
-	movq	%rax,%r15
-	addq	(%rbp),%r12
-	xorq	%rax,%r14
-
-	xorq	%rbx,%r15
-	rorq	$14,%r13
-	movq	%rbx,%r11
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r11
-	addq	%r12,%rdx
-	addq	%r12,%r11
-
-	leaq	8(%rbp),%rbp
-	movq	16(%rsp),%r13
-	movq	120(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r11
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	80(%rsp),%r12
-
-	addq	8(%rsp),%r12
-	movq	%rdx,%r13
-	addq	%rdi,%r12
-	movq	%r11,%r14
-	rorq	$23,%r13
-	movq	%r8,%rdi
-
-	xorq	%rdx,%r13
-	rorq	$5,%r14
-	xorq	%r9,%rdi
-
-	movq	%r12,8(%rsp)
-	xorq	%r11,%r14
-	andq	%rdx,%rdi
-
-	rorq	$4,%r13
-	addq	%r10,%r12
-	xorq	%r9,%rdi
-
-	rorq	$6,%r14
-	xorq	%rdx,%r13
-	addq	%rdi,%r12
-
-	movq	%r11,%rdi
-	addq	(%rbp),%r12
-	xorq	%r11,%r14
-
-	xorq	%rax,%rdi
-	rorq	$14,%r13
-	movq	%rax,%r10
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r10
-	addq	%r12,%rcx
-	addq	%r12,%r10
-
-	leaq	24(%rbp),%rbp
-	movq	24(%rsp),%r13
-	movq	0(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r10
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	88(%rsp),%r12
-
-	addq	16(%rsp),%r12
-	movq	%rcx,%r13
-	addq	%r15,%r12
-	movq	%r10,%r14
-	rorq	$23,%r13
-	movq	%rdx,%r15
-
-	xorq	%rcx,%r13
-	rorq	$5,%r14
-	xorq	%r8,%r15
-
-	movq	%r12,16(%rsp)
-	xorq	%r10,%r14
-	andq	%rcx,%r15
-
-	rorq	$4,%r13
-	addq	%r9,%r12
-	xorq	%r8,%r15
-
-	rorq	$6,%r14
-	xorq	%rcx,%r13
-	addq	%r15,%r12
-
-	movq	%r10,%r15
-	addq	(%rbp),%r12
-	xorq	%r10,%r14
-
-	xorq	%r11,%r15
-	rorq	$14,%r13
-	movq	%r11,%r9
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r9
-	addq	%r12,%rbx
-	addq	%r12,%r9
-
-	leaq	8(%rbp),%rbp
-	movq	32(%rsp),%r13
-	movq	8(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r9
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	96(%rsp),%r12
-
-	addq	24(%rsp),%r12
-	movq	%rbx,%r13
-	addq	%rdi,%r12
-	movq	%r9,%r14
-	rorq	$23,%r13
-	movq	%rcx,%rdi
-
-	xorq	%rbx,%r13
-	rorq	$5,%r14
-	xorq	%rdx,%rdi
-
-	movq	%r12,24(%rsp)
-	xorq	%r9,%r14
-	andq	%rbx,%rdi
-
-	rorq	$4,%r13
-	addq	%r8,%r12
-	xorq	%rdx,%rdi
-
-	rorq	$6,%r14
-	xorq	%rbx,%r13
-	addq	%rdi,%r12
-
-	movq	%r9,%rdi
-	addq	(%rbp),%r12
-	xorq	%r9,%r14
-
-	xorq	%r10,%rdi
-	rorq	$14,%r13
-	movq	%r10,%r8
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r8
-	addq	%r12,%rax
-	addq	%r12,%r8
-
-	leaq	24(%rbp),%rbp
-	movq	40(%rsp),%r13
-	movq	16(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r8
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	104(%rsp),%r12
-
-	addq	32(%rsp),%r12
-	movq	%rax,%r13
-	addq	%r15,%r12
-	movq	%r8,%r14
-	rorq	$23,%r13
-	movq	%rbx,%r15
-
-	xorq	%rax,%r13
-	rorq	$5,%r14
-	xorq	%rcx,%r15
-
-	movq	%r12,32(%rsp)
-	xorq	%r8,%r14
-	andq	%rax,%r15
-
-	rorq	$4,%r13
-	addq	%rdx,%r12
-	xorq	%rcx,%r15
-
-	rorq	$6,%r14
-	xorq	%rax,%r13
-	addq	%r15,%r12
-
-	movq	%r8,%r15
-	addq	(%rbp),%r12
-	xorq	%r8,%r14
-
-	xorq	%r9,%r15
-	rorq	$14,%r13
-	movq	%r9,%rdx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rdx
-	addq	%r12,%r11
-	addq	%r12,%rdx
-
-	leaq	8(%rbp),%rbp
-	movq	48(%rsp),%r13
-	movq	24(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rdx
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	112(%rsp),%r12
-
-	addq	40(%rsp),%r12
-	movq	%r11,%r13
-	addq	%rdi,%r12
-	movq	%rdx,%r14
-	rorq	$23,%r13
-	movq	%rax,%rdi
-
-	xorq	%r11,%r13
-	rorq	$5,%r14
-	xorq	%rbx,%rdi
-
-	movq	%r12,40(%rsp)
-	xorq	%rdx,%r14
-	andq	%r11,%rdi
-
-	rorq	$4,%r13
-	addq	%rcx,%r12
-	xorq	%rbx,%rdi
-
-	rorq	$6,%r14
-	xorq	%r11,%r13
-	addq	%rdi,%r12
-
-	movq	%rdx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rdx,%r14
-
-	xorq	%r8,%rdi
-	rorq	$14,%r13
-	movq	%r8,%rcx
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rcx
-	addq	%r12,%r10
-	addq	%r12,%rcx
-
-	leaq	24(%rbp),%rbp
-	movq	56(%rsp),%r13
-	movq	32(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rcx
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	120(%rsp),%r12
-
-	addq	48(%rsp),%r12
-	movq	%r10,%r13
-	addq	%r15,%r12
-	movq	%rcx,%r14
-	rorq	$23,%r13
-	movq	%r11,%r15
-
-	xorq	%r10,%r13
-	rorq	$5,%r14
-	xorq	%rax,%r15
-
-	movq	%r12,48(%rsp)
-	xorq	%rcx,%r14
-	andq	%r10,%r15
-
-	rorq	$4,%r13
-	addq	%rbx,%r12
-	xorq	%rax,%r15
-
-	rorq	$6,%r14
-	xorq	%r10,%r13
-	addq	%r15,%r12
-
-	movq	%rcx,%r15
-	addq	(%rbp),%r12
-	xorq	%rcx,%r14
-
-	xorq	%rdx,%r15
-	rorq	$14,%r13
-	movq	%rdx,%rbx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rbx
-	addq	%r12,%r9
-	addq	%r12,%rbx
-
-	leaq	8(%rbp),%rbp
-	movq	64(%rsp),%r13
-	movq	40(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rbx
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	0(%rsp),%r12
-
-	addq	56(%rsp),%r12
-	movq	%r9,%r13
-	addq	%rdi,%r12
-	movq	%rbx,%r14
-	rorq	$23,%r13
-	movq	%r10,%rdi
-
-	xorq	%r9,%r13
-	rorq	$5,%r14
-	xorq	%r11,%rdi
-
-	movq	%r12,56(%rsp)
-	xorq	%rbx,%r14
-	andq	%r9,%rdi
-
-	rorq	$4,%r13
-	addq	%rax,%r12
-	xorq	%r11,%rdi
-
-	rorq	$6,%r14
-	xorq	%r9,%r13
-	addq	%rdi,%r12
-
-	movq	%rbx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rbx,%r14
-
-	xorq	%rcx,%rdi
-	rorq	$14,%r13
-	movq	%rcx,%rax
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rax
-	addq	%r12,%r8
-	addq	%r12,%rax
-
-	leaq	24(%rbp),%rbp
-	movq	72(%rsp),%r13
-	movq	48(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rax
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	8(%rsp),%r12
-
-	addq	64(%rsp),%r12
-	movq	%r8,%r13
-	addq	%r15,%r12
-	movq	%rax,%r14
-	rorq	$23,%r13
-	movq	%r9,%r15
-
-	xorq	%r8,%r13
-	rorq	$5,%r14
-	xorq	%r10,%r15
-
-	movq	%r12,64(%rsp)
-	xorq	%rax,%r14
-	andq	%r8,%r15
-
-	rorq	$4,%r13
-	addq	%r11,%r12
-	xorq	%r10,%r15
-
-	rorq	$6,%r14
-	xorq	%r8,%r13
-	addq	%r15,%r12
-
-	movq	%rax,%r15
-	addq	(%rbp),%r12
-	xorq	%rax,%r14
-
-	xorq	%rbx,%r15
-	rorq	$14,%r13
-	movq	%rbx,%r11
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r11
-	addq	%r12,%rdx
-	addq	%r12,%r11
-
-	leaq	8(%rbp),%rbp
-	movq	80(%rsp),%r13
-	movq	56(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r11
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	16(%rsp),%r12
-
-	addq	72(%rsp),%r12
-	movq	%rdx,%r13
-	addq	%rdi,%r12
-	movq	%r11,%r14
-	rorq	$23,%r13
-	movq	%r8,%rdi
-
-	xorq	%rdx,%r13
-	rorq	$5,%r14
-	xorq	%r9,%rdi
-
-	movq	%r12,72(%rsp)
-	xorq	%r11,%r14
-	andq	%rdx,%rdi
-
-	rorq	$4,%r13
-	addq	%r10,%r12
-	xorq	%r9,%rdi
-
-	rorq	$6,%r14
-	xorq	%rdx,%r13
-	addq	%rdi,%r12
-
-	movq	%r11,%rdi
-	addq	(%rbp),%r12
-	xorq	%r11,%r14
-
-	xorq	%rax,%rdi
-	rorq	$14,%r13
-	movq	%rax,%r10
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r10
-	addq	%r12,%rcx
-	addq	%r12,%r10
-
-	leaq	24(%rbp),%rbp
-	movq	88(%rsp),%r13
-	movq	64(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r10
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	24(%rsp),%r12
-
-	addq	80(%rsp),%r12
-	movq	%rcx,%r13
-	addq	%r15,%r12
-	movq	%r10,%r14
-	rorq	$23,%r13
-	movq	%rdx,%r15
-
-	xorq	%rcx,%r13
-	rorq	$5,%r14
-	xorq	%r8,%r15
-
-	movq	%r12,80(%rsp)
-	xorq	%r10,%r14
-	andq	%rcx,%r15
-
-	rorq	$4,%r13
-	addq	%r9,%r12
-	xorq	%r8,%r15
-
-	rorq	$6,%r14
-	xorq	%rcx,%r13
-	addq	%r15,%r12
-
-	movq	%r10,%r15
-	addq	(%rbp),%r12
-	xorq	%r10,%r14
-
-	xorq	%r11,%r15
-	rorq	$14,%r13
-	movq	%r11,%r9
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%r9
-	addq	%r12,%rbx
-	addq	%r12,%r9
-
-	leaq	8(%rbp),%rbp
-	movq	96(%rsp),%r13
-	movq	72(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r9
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	32(%rsp),%r12
-
-	addq	88(%rsp),%r12
-	movq	%rbx,%r13
-	addq	%rdi,%r12
-	movq	%r9,%r14
-	rorq	$23,%r13
-	movq	%rcx,%rdi
-
-	xorq	%rbx,%r13
-	rorq	$5,%r14
-	xorq	%rdx,%rdi
-
-	movq	%r12,88(%rsp)
-	xorq	%r9,%r14
-	andq	%rbx,%rdi
-
-	rorq	$4,%r13
-	addq	%r8,%r12
-	xorq	%rdx,%rdi
-
-	rorq	$6,%r14
-	xorq	%rbx,%r13
-	addq	%rdi,%r12
-
-	movq	%r9,%rdi
-	addq	(%rbp),%r12
-	xorq	%r9,%r14
-
-	xorq	%r10,%rdi
-	rorq	$14,%r13
-	movq	%r10,%r8
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%r8
-	addq	%r12,%rax
-	addq	%r12,%r8
-
-	leaq	24(%rbp),%rbp
-	movq	104(%rsp),%r13
-	movq	80(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%r8
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	40(%rsp),%r12
-
-	addq	96(%rsp),%r12
-	movq	%rax,%r13
-	addq	%r15,%r12
-	movq	%r8,%r14
-	rorq	$23,%r13
-	movq	%rbx,%r15
-
-	xorq	%rax,%r13
-	rorq	$5,%r14
-	xorq	%rcx,%r15
-
-	movq	%r12,96(%rsp)
-	xorq	%r8,%r14
-	andq	%rax,%r15
-
-	rorq	$4,%r13
-	addq	%rdx,%r12
-	xorq	%rcx,%r15
-
-	rorq	$6,%r14
-	xorq	%rax,%r13
-	addq	%r15,%r12
-
-	movq	%r8,%r15
-	addq	(%rbp),%r12
-	xorq	%r8,%r14
-
-	xorq	%r9,%r15
-	rorq	$14,%r13
-	movq	%r9,%rdx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rdx
-	addq	%r12,%r11
-	addq	%r12,%rdx
-
-	leaq	8(%rbp),%rbp
-	movq	112(%rsp),%r13
-	movq	88(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rdx
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	48(%rsp),%r12
-
-	addq	104(%rsp),%r12
-	movq	%r11,%r13
-	addq	%rdi,%r12
-	movq	%rdx,%r14
-	rorq	$23,%r13
-	movq	%rax,%rdi
-
-	xorq	%r11,%r13
-	rorq	$5,%r14
-	xorq	%rbx,%rdi
-
-	movq	%r12,104(%rsp)
-	xorq	%rdx,%r14
-	andq	%r11,%rdi
-
-	rorq	$4,%r13
-	addq	%rcx,%r12
-	xorq	%rbx,%rdi
-
-	rorq	$6,%r14
-	xorq	%r11,%r13
-	addq	%rdi,%r12
-
-	movq	%rdx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rdx,%r14
-
-	xorq	%r8,%rdi
-	rorq	$14,%r13
-	movq	%r8,%rcx
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rcx
-	addq	%r12,%r10
-	addq	%r12,%rcx
-
-	leaq	24(%rbp),%rbp
-	movq	120(%rsp),%r13
-	movq	96(%rsp),%r15
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rcx
-	movq	%r15,%r14
-	rorq	$42,%r15
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%r15
-	shrq	$6,%r14
-
-	rorq	$19,%r15
-	xorq	%r13,%r12
-	xorq	%r14,%r15
-	addq	56(%rsp),%r12
-
-	addq	112(%rsp),%r12
-	movq	%r10,%r13
-	addq	%r15,%r12
-	movq	%rcx,%r14
-	rorq	$23,%r13
-	movq	%r11,%r15
-
-	xorq	%r10,%r13
-	rorq	$5,%r14
-	xorq	%rax,%r15
-
-	movq	%r12,112(%rsp)
-	xorq	%rcx,%r14
-	andq	%r10,%r15
-
-	rorq	$4,%r13
-	addq	%rbx,%r12
-	xorq	%rax,%r15
-
-	rorq	$6,%r14
-	xorq	%r10,%r13
-	addq	%r15,%r12
-
-	movq	%rcx,%r15
-	addq	(%rbp),%r12
-	xorq	%rcx,%r14
-
-	xorq	%rdx,%r15
-	rorq	$14,%r13
-	movq	%rdx,%rbx
-
-	andq	%r15,%rdi
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%rdi,%rbx
-	addq	%r12,%r9
-	addq	%r12,%rbx
-
-	leaq	8(%rbp),%rbp
-	movq	0(%rsp),%r13
-	movq	104(%rsp),%rdi
-
-	movq	%r13,%r12
-	rorq	$7,%r13
-	addq	%r14,%rbx
-	movq	%rdi,%r14
-	rorq	$42,%rdi
-
-	xorq	%r12,%r13
-	shrq	$7,%r12
-	rorq	$1,%r13
-	xorq	%r14,%rdi
-	shrq	$6,%r14
-
-	rorq	$19,%rdi
-	xorq	%r13,%r12
-	xorq	%r14,%rdi
-	addq	64(%rsp),%r12
-
-	addq	120(%rsp),%r12
-	movq	%r9,%r13
-	addq	%rdi,%r12
-	movq	%rbx,%r14
-	rorq	$23,%r13
-	movq	%r10,%rdi
-
-	xorq	%r9,%r13
-	rorq	$5,%r14
-	xorq	%r11,%rdi
-
-	movq	%r12,120(%rsp)
-	xorq	%rbx,%r14
-	andq	%r9,%rdi
-
-	rorq	$4,%r13
-	addq	%rax,%r12
-	xorq	%r11,%rdi
-
-	rorq	$6,%r14
-	xorq	%r9,%r13
-	addq	%rdi,%r12
-
-	movq	%rbx,%rdi
-	addq	(%rbp),%r12
-	xorq	%rbx,%r14
-
-	xorq	%rcx,%rdi
-	rorq	$14,%r13
-	movq	%rcx,%rax
-
-	andq	%rdi,%r15
-	rorq	$28,%r14
-	addq	%r13,%r12
-
-	xorq	%r15,%rax
-	addq	%r12,%r8
-	addq	%r12,%rax
-
-	leaq	24(%rbp),%rbp
-	cmpb	$0,7(%rbp)
-	jnz	L$rounds_16_xx
-
-	movq	128+0(%rsp),%rdi
-	addq	%r14,%rax
-	leaq	128(%rsi),%rsi
-
-	addq	0(%rdi),%rax
-	addq	8(%rdi),%rbx
-	addq	16(%rdi),%rcx
-	addq	24(%rdi),%rdx
-	addq	32(%rdi),%r8
-	addq	40(%rdi),%r9
-	addq	48(%rdi),%r10
-	addq	56(%rdi),%r11
-
-	cmpq	128+16(%rsp),%rsi
-
-	movq	%rax,0(%rdi)
-	movq	%rbx,8(%rdi)
-	movq	%rcx,16(%rdi)
-	movq	%rdx,24(%rdi)
-	movq	%r8,32(%rdi)
-	movq	%r9,40(%rdi)
-	movq	%r10,48(%rdi)
-	movq	%r11,56(%rdi)
-	jb	L$loop
-
-	movq	152(%rsp),%rsi
-
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue:
-	ret
-
-
-.section	__DATA,__const
-.p2align	6
-
-K512:
-.quad	0x428a2f98d728ae22,0x7137449123ef65cd
-.quad	0x428a2f98d728ae22,0x7137449123ef65cd
-.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-.quad	0x3956c25bf348b538,0x59f111f1b605d019
-.quad	0x3956c25bf348b538,0x59f111f1b605d019
-.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
-.quad	0xd807aa98a3030242,0x12835b0145706fbe
-.quad	0xd807aa98a3030242,0x12835b0145706fbe
-.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
-.quad	0x9bdc06a725c71235,0xc19bf174cf692694
-.quad	0x9bdc06a725c71235,0xc19bf174cf692694
-.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
-.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
-.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-.quad	0x983e5152ee66dfab,0xa831c66d2db43210
-.quad	0x983e5152ee66dfab,0xa831c66d2db43210
-.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
-.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
-.quad	0x06ca6351e003826f,0x142929670a0e6e70
-.quad	0x06ca6351e003826f,0x142929670a0e6e70
-.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
-.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
-.quad	0x81c2c92e47edaee6,0x92722c851482353b
-.quad	0x81c2c92e47edaee6,0x92722c851482353b
-.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
-.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
-.quad	0xd192e819d6ef5218,0xd69906245565a910
-.quad	0xd192e819d6ef5218,0xd69906245565a910
-.quad	0xf40e35855771202a,0x106aa07032bbd1b8
-.quad	0xf40e35855771202a,0x106aa07032bbd1b8
-.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
-.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
-.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
-.quad	0x90befffa23631e28,0xa4506cebde82bde9
-.quad	0x90befffa23631e28,0xa4506cebde82bde9
-.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
-.quad	0xca273eceea26619c,0xd186b8c721c0c207
-.quad	0xca273eceea26619c,0xd186b8c721c0c207
-.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
-.quad	0x113f9804bef90dae,0x1b710b35131c471b
-.quad	0x113f9804bef90dae,0x1b710b35131c471b
-.quad	0x28db77f523047d84,0x32caab7b40c72493
-.quad	0x28db77f523047d84,0x32caab7b40c72493
-.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
-.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
-
-.quad	0x0001020304050607,0x08090a0b0c0d0e0f
-.quad	0x0001020304050607,0x08090a0b0c0d0e0f
-.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.text	
-
-.p2align	6
-sha512_block_data_order_avx:
-
-L$avx_shortcut:
-	movq	%rsp,%rax
-
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-	shlq	$4,%rdx
-	subq	$160,%rsp
-	leaq	(%rsi,%rdx,8),%rdx
-	andq	$-64,%rsp
-	movq	%rdi,128+0(%rsp)
-	movq	%rsi,128+8(%rsp)
-	movq	%rdx,128+16(%rsp)
-	movq	%rax,152(%rsp)
-
-L$prologue_avx:
-
-	vzeroupper
-	movq	0(%rdi),%rax
-	movq	8(%rdi),%rbx
-	movq	16(%rdi),%rcx
-	movq	24(%rdi),%rdx
-	movq	32(%rdi),%r8
-	movq	40(%rdi),%r9
-	movq	48(%rdi),%r10
-	movq	56(%rdi),%r11
-	jmp	L$loop_avx
-.p2align	4
-L$loop_avx:
-	vmovdqa	K512+1280(%rip),%xmm11
-	vmovdqu	0(%rsi),%xmm0
-	leaq	K512+128(%rip),%rbp
-	vmovdqu	16(%rsi),%xmm1
-	vmovdqu	32(%rsi),%xmm2
-	vpshufb	%xmm11,%xmm0,%xmm0
-	vmovdqu	48(%rsi),%xmm3
-	vpshufb	%xmm11,%xmm1,%xmm1
-	vmovdqu	64(%rsi),%xmm4
-	vpshufb	%xmm11,%xmm2,%xmm2
-	vmovdqu	80(%rsi),%xmm5
-	vpshufb	%xmm11,%xmm3,%xmm3
-	vmovdqu	96(%rsi),%xmm6
-	vpshufb	%xmm11,%xmm4,%xmm4
-	vmovdqu	112(%rsi),%xmm7
-	vpshufb	%xmm11,%xmm5,%xmm5
-	vpaddq	-128(%rbp),%xmm0,%xmm8
-	vpshufb	%xmm11,%xmm6,%xmm6
-	vpaddq	-96(%rbp),%xmm1,%xmm9
-	vpshufb	%xmm11,%xmm7,%xmm7
-	vpaddq	-64(%rbp),%xmm2,%xmm10
-	vpaddq	-32(%rbp),%xmm3,%xmm11
-	vmovdqa	%xmm8,0(%rsp)
-	vpaddq	0(%rbp),%xmm4,%xmm8
-	vmovdqa	%xmm9,16(%rsp)
-	vpaddq	32(%rbp),%xmm5,%xmm9
-	vmovdqa	%xmm10,32(%rsp)
-	vpaddq	64(%rbp),%xmm6,%xmm10
-	vmovdqa	%xmm11,48(%rsp)
-	vpaddq	96(%rbp),%xmm7,%xmm11
-	vmovdqa	%xmm8,64(%rsp)
-	movq	%rax,%r14
-	vmovdqa	%xmm9,80(%rsp)
-	movq	%rbx,%rdi
-	vmovdqa	%xmm10,96(%rsp)
-	xorq	%rcx,%rdi
-	vmovdqa	%xmm11,112(%rsp)
-	movq	%r8,%r13
-	jmp	L$avx_00_47
-
-.p2align	4
-L$avx_00_47:
-	addq	$256,%rbp
-	vpalignr	$8,%xmm0,%xmm1,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rax
-	vpalignr	$8,%xmm4,%xmm5,%xmm11
-	movq	%r9,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%r8,%r13
-	xorq	%r10,%r12
-	vpaddq	%xmm11,%xmm0,%xmm0
-	shrdq	$4,%r13,%r13
-	xorq	%rax,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%r8,%r12
-	xorq	%r8,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	0(%rsp),%r11
-	movq	%rax,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%r10,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%rbx,%r15
-	addq	%r12,%r11
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%rax,%r14
-	addq	%r13,%r11
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%rbx,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm7,%xmm11
-	addq	%r11,%rdx
-	addq	%rdi,%r11
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%rdx,%r13
-	addq	%r11,%r14
-	vpsllq	$3,%xmm7,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r11
-	vpaddq	%xmm8,%xmm0,%xmm0
-	movq	%r8,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm7,%xmm9
-	xorq	%rdx,%r13
-	xorq	%r9,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%r11,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%rdx,%r12
-	xorq	%rdx,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	8(%rsp),%r10
-	movq	%r11,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%r9,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%rax,%rdi
-	addq	%r12,%r10
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm0,%xmm0
-	xorq	%r11,%r14
-	addq	%r13,%r10
-	vpaddq	-128(%rbp),%xmm0,%xmm10
-	xorq	%rax,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r10,%rcx
-	addq	%r15,%r10
-	movq	%rcx,%r13
-	addq	%r10,%r14
-	vmovdqa	%xmm10,0(%rsp)
-	vpalignr	$8,%xmm1,%xmm2,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r10
-	vpalignr	$8,%xmm5,%xmm6,%xmm11
-	movq	%rdx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%rcx,%r13
-	xorq	%r8,%r12
-	vpaddq	%xmm11,%xmm1,%xmm1
-	shrdq	$4,%r13,%r13
-	xorq	%r10,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%rcx,%r12
-	xorq	%rcx,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	16(%rsp),%r9
-	movq	%r10,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%r8,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%r11,%r15
-	addq	%r12,%r9
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%r10,%r14
-	addq	%r13,%r9
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%r11,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm0,%xmm11
-	addq	%r9,%rbx
-	addq	%rdi,%r9
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%rbx,%r13
-	addq	%r9,%r14
-	vpsllq	$3,%xmm0,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r9
-	vpaddq	%xmm8,%xmm1,%xmm1
-	movq	%rcx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm0,%xmm9
-	xorq	%rbx,%r13
-	xorq	%rdx,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%r9,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%rbx,%r12
-	xorq	%rbx,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	24(%rsp),%r8
-	movq	%r9,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%rdx,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%r10,%rdi
-	addq	%r12,%r8
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm1,%xmm1
-	xorq	%r9,%r14
-	addq	%r13,%r8
-	vpaddq	-96(%rbp),%xmm1,%xmm10
-	xorq	%r10,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r8,%rax
-	addq	%r15,%r8
-	movq	%rax,%r13
-	addq	%r8,%r14
-	vmovdqa	%xmm10,16(%rsp)
-	vpalignr	$8,%xmm2,%xmm3,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r8
-	vpalignr	$8,%xmm6,%xmm7,%xmm11
-	movq	%rbx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%rax,%r13
-	xorq	%rcx,%r12
-	vpaddq	%xmm11,%xmm2,%xmm2
-	shrdq	$4,%r13,%r13
-	xorq	%r8,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%rax,%r12
-	xorq	%rax,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	32(%rsp),%rdx
-	movq	%r8,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%rcx,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%r9,%r15
-	addq	%r12,%rdx
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%r8,%r14
-	addq	%r13,%rdx
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%r9,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm1,%xmm11
-	addq	%rdx,%r11
-	addq	%rdi,%rdx
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%r11,%r13
-	addq	%rdx,%r14
-	vpsllq	$3,%xmm1,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rdx
-	vpaddq	%xmm8,%xmm2,%xmm2
-	movq	%rax,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm1,%xmm9
-	xorq	%r11,%r13
-	xorq	%rbx,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%rdx,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%r11,%r12
-	xorq	%r11,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	40(%rsp),%rcx
-	movq	%rdx,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%rbx,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%r8,%rdi
-	addq	%r12,%rcx
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm2,%xmm2
-	xorq	%rdx,%r14
-	addq	%r13,%rcx
-	vpaddq	-64(%rbp),%xmm2,%xmm10
-	xorq	%r8,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rcx,%r10
-	addq	%r15,%rcx
-	movq	%r10,%r13
-	addq	%rcx,%r14
-	vmovdqa	%xmm10,32(%rsp)
-	vpalignr	$8,%xmm3,%xmm4,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rcx
-	vpalignr	$8,%xmm7,%xmm0,%xmm11
-	movq	%r11,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%r10,%r13
-	xorq	%rax,%r12
-	vpaddq	%xmm11,%xmm3,%xmm3
-	shrdq	$4,%r13,%r13
-	xorq	%rcx,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%r10,%r12
-	xorq	%r10,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	48(%rsp),%rbx
-	movq	%rcx,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%rax,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%rdx,%r15
-	addq	%r12,%rbx
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%rcx,%r14
-	addq	%r13,%rbx
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%rdx,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm2,%xmm11
-	addq	%rbx,%r9
-	addq	%rdi,%rbx
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%r9,%r13
-	addq	%rbx,%r14
-	vpsllq	$3,%xmm2,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rbx
-	vpaddq	%xmm8,%xmm3,%xmm3
-	movq	%r10,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm2,%xmm9
-	xorq	%r9,%r13
-	xorq	%r11,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%rbx,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%r9,%r12
-	xorq	%r9,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	56(%rsp),%rax
-	movq	%rbx,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%r11,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%rcx,%rdi
-	addq	%r12,%rax
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm3,%xmm3
-	xorq	%rbx,%r14
-	addq	%r13,%rax
-	vpaddq	-32(%rbp),%xmm3,%xmm10
-	xorq	%rcx,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rax,%r8
-	addq	%r15,%rax
-	movq	%r8,%r13
-	addq	%rax,%r14
-	vmovdqa	%xmm10,48(%rsp)
-	vpalignr	$8,%xmm4,%xmm5,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rax
-	vpalignr	$8,%xmm0,%xmm1,%xmm11
-	movq	%r9,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%r8,%r13
-	xorq	%r10,%r12
-	vpaddq	%xmm11,%xmm4,%xmm4
-	shrdq	$4,%r13,%r13
-	xorq	%rax,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%r8,%r12
-	xorq	%r8,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	64(%rsp),%r11
-	movq	%rax,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%r10,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%rbx,%r15
-	addq	%r12,%r11
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%rax,%r14
-	addq	%r13,%r11
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%rbx,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm3,%xmm11
-	addq	%r11,%rdx
-	addq	%rdi,%r11
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%rdx,%r13
-	addq	%r11,%r14
-	vpsllq	$3,%xmm3,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r11
-	vpaddq	%xmm8,%xmm4,%xmm4
-	movq	%r8,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm3,%xmm9
-	xorq	%rdx,%r13
-	xorq	%r9,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%r11,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%rdx,%r12
-	xorq	%rdx,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	72(%rsp),%r10
-	movq	%r11,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%r9,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%rax,%rdi
-	addq	%r12,%r10
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm4,%xmm4
-	xorq	%r11,%r14
-	addq	%r13,%r10
-	vpaddq	0(%rbp),%xmm4,%xmm10
-	xorq	%rax,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r10,%rcx
-	addq	%r15,%r10
-	movq	%rcx,%r13
-	addq	%r10,%r14
-	vmovdqa	%xmm10,64(%rsp)
-	vpalignr	$8,%xmm5,%xmm6,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r10
-	vpalignr	$8,%xmm1,%xmm2,%xmm11
-	movq	%rdx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%rcx,%r13
-	xorq	%r8,%r12
-	vpaddq	%xmm11,%xmm5,%xmm5
-	shrdq	$4,%r13,%r13
-	xorq	%r10,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%rcx,%r12
-	xorq	%rcx,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	80(%rsp),%r9
-	movq	%r10,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%r8,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%r11,%r15
-	addq	%r12,%r9
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%r10,%r14
-	addq	%r13,%r9
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%r11,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm4,%xmm11
-	addq	%r9,%rbx
-	addq	%rdi,%r9
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%rbx,%r13
-	addq	%r9,%r14
-	vpsllq	$3,%xmm4,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r9
-	vpaddq	%xmm8,%xmm5,%xmm5
-	movq	%rcx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm4,%xmm9
-	xorq	%rbx,%r13
-	xorq	%rdx,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%r9,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%rbx,%r12
-	xorq	%rbx,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	88(%rsp),%r8
-	movq	%r9,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%rdx,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%r10,%rdi
-	addq	%r12,%r8
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm5,%xmm5
-	xorq	%r9,%r14
-	addq	%r13,%r8
-	vpaddq	32(%rbp),%xmm5,%xmm10
-	xorq	%r10,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r8,%rax
-	addq	%r15,%r8
-	movq	%rax,%r13
-	addq	%r8,%r14
-	vmovdqa	%xmm10,80(%rsp)
-	vpalignr	$8,%xmm6,%xmm7,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r8
-	vpalignr	$8,%xmm2,%xmm3,%xmm11
-	movq	%rbx,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%rax,%r13
-	xorq	%rcx,%r12
-	vpaddq	%xmm11,%xmm6,%xmm6
-	shrdq	$4,%r13,%r13
-	xorq	%r8,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%rax,%r12
-	xorq	%rax,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	96(%rsp),%rdx
-	movq	%r8,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%rcx,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%r9,%r15
-	addq	%r12,%rdx
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%r8,%r14
-	addq	%r13,%rdx
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%r9,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm5,%xmm11
-	addq	%rdx,%r11
-	addq	%rdi,%rdx
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%r11,%r13
-	addq	%rdx,%r14
-	vpsllq	$3,%xmm5,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rdx
-	vpaddq	%xmm8,%xmm6,%xmm6
-	movq	%rax,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm5,%xmm9
-	xorq	%r11,%r13
-	xorq	%rbx,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%rdx,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%r11,%r12
-	xorq	%r11,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	104(%rsp),%rcx
-	movq	%rdx,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%rbx,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%r8,%rdi
-	addq	%r12,%rcx
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm6,%xmm6
-	xorq	%rdx,%r14
-	addq	%r13,%rcx
-	vpaddq	64(%rbp),%xmm6,%xmm10
-	xorq	%r8,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rcx,%r10
-	addq	%r15,%rcx
-	movq	%r10,%r13
-	addq	%rcx,%r14
-	vmovdqa	%xmm10,96(%rsp)
-	vpalignr	$8,%xmm7,%xmm0,%xmm8
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rcx
-	vpalignr	$8,%xmm3,%xmm4,%xmm11
-	movq	%r11,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$1,%xmm8,%xmm10
-	xorq	%r10,%r13
-	xorq	%rax,%r12
-	vpaddq	%xmm11,%xmm7,%xmm7
-	shrdq	$4,%r13,%r13
-	xorq	%rcx,%r14
-	vpsrlq	$7,%xmm8,%xmm11
-	andq	%r10,%r12
-	xorq	%r10,%r13
-	vpsllq	$56,%xmm8,%xmm9
-	addq	112(%rsp),%rbx
-	movq	%rcx,%r15
-	vpxor	%xmm10,%xmm11,%xmm8
-	xorq	%rax,%r12
-	shrdq	$6,%r14,%r14
-	vpsrlq	$7,%xmm10,%xmm10
-	xorq	%rdx,%r15
-	addq	%r12,%rbx
-	vpxor	%xmm9,%xmm8,%xmm8
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	vpsllq	$7,%xmm9,%xmm9
-	xorq	%rcx,%r14
-	addq	%r13,%rbx
-	vpxor	%xmm10,%xmm8,%xmm8
-	xorq	%rdx,%rdi
-	shrdq	$28,%r14,%r14
-	vpsrlq	$6,%xmm6,%xmm11
-	addq	%rbx,%r9
-	addq	%rdi,%rbx
-	vpxor	%xmm9,%xmm8,%xmm8
-	movq	%r9,%r13
-	addq	%rbx,%r14
-	vpsllq	$3,%xmm6,%xmm10
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rbx
-	vpaddq	%xmm8,%xmm7,%xmm7
-	movq	%r10,%r12
-	shrdq	$5,%r14,%r14
-	vpsrlq	$19,%xmm6,%xmm9
-	xorq	%r9,%r13
-	xorq	%r11,%r12
-	vpxor	%xmm10,%xmm11,%xmm11
-	shrdq	$4,%r13,%r13
-	xorq	%rbx,%r14
-	vpsllq	$42,%xmm10,%xmm10
-	andq	%r9,%r12
-	xorq	%r9,%r13
-	vpxor	%xmm9,%xmm11,%xmm11
-	addq	120(%rsp),%rax
-	movq	%rbx,%rdi
-	vpsrlq	$42,%xmm9,%xmm9
-	xorq	%r11,%r12
-	shrdq	$6,%r14,%r14
-	vpxor	%xmm10,%xmm11,%xmm11
-	xorq	%rcx,%rdi
-	addq	%r12,%rax
-	vpxor	%xmm9,%xmm11,%xmm11
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	vpaddq	%xmm11,%xmm7,%xmm7
-	xorq	%rbx,%r14
-	addq	%r13,%rax
-	vpaddq	96(%rbp),%xmm7,%xmm10
-	xorq	%rcx,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rax,%r8
-	addq	%r15,%rax
-	movq	%r8,%r13
-	addq	%rax,%r14
-	vmovdqa	%xmm10,112(%rsp)
-	cmpb	$0,135(%rbp)
-	jne	L$avx_00_47
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rax
-	movq	%r9,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r8,%r13
-	xorq	%r10,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rax,%r14
-	andq	%r8,%r12
-	xorq	%r8,%r13
-	addq	0(%rsp),%r11
-	movq	%rax,%r15
-	xorq	%r10,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rbx,%r15
-	addq	%r12,%r11
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%rax,%r14
-	addq	%r13,%r11
-	xorq	%rbx,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%r11,%rdx
-	addq	%rdi,%r11
-	movq	%rdx,%r13
-	addq	%r11,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r11
-	movq	%r8,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r11,%r14
-	andq	%rdx,%r12
-	xorq	%rdx,%r13
-	addq	8(%rsp),%r10
-	movq	%r11,%rdi
-	xorq	%r9,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rax,%rdi
-	addq	%r12,%r10
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%r11,%r14
-	addq	%r13,%r10
-	xorq	%rax,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r10,%rcx
-	addq	%r15,%r10
-	movq	%rcx,%r13
-	addq	%r10,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r10
-	movq	%rdx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rcx,%r13
-	xorq	%r8,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r10,%r14
-	andq	%rcx,%r12
-	xorq	%rcx,%r13
-	addq	16(%rsp),%r9
-	movq	%r10,%r15
-	xorq	%r8,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r11,%r15
-	addq	%r12,%r9
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%r10,%r14
-	addq	%r13,%r9
-	xorq	%r11,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%r9,%rbx
-	addq	%rdi,%r9
-	movq	%rbx,%r13
-	addq	%r9,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r9
-	movq	%rcx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r9,%r14
-	andq	%rbx,%r12
-	xorq	%rbx,%r13
-	addq	24(%rsp),%r8
-	movq	%r9,%rdi
-	xorq	%rdx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r10,%rdi
-	addq	%r12,%r8
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%r9,%r14
-	addq	%r13,%r8
-	xorq	%r10,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r8,%rax
-	addq	%r15,%r8
-	movq	%rax,%r13
-	addq	%r8,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r8
-	movq	%rbx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rax,%r13
-	xorq	%rcx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r8,%r14
-	andq	%rax,%r12
-	xorq	%rax,%r13
-	addq	32(%rsp),%rdx
-	movq	%r8,%r15
-	xorq	%rcx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r9,%r15
-	addq	%r12,%rdx
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%r8,%r14
-	addq	%r13,%rdx
-	xorq	%r9,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%rdx,%r11
-	addq	%rdi,%rdx
-	movq	%r11,%r13
-	addq	%rdx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rdx
-	movq	%rax,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rdx,%r14
-	andq	%r11,%r12
-	xorq	%r11,%r13
-	addq	40(%rsp),%rcx
-	movq	%rdx,%rdi
-	xorq	%rbx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r8,%rdi
-	addq	%r12,%rcx
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%rdx,%r14
-	addq	%r13,%rcx
-	xorq	%r8,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rcx,%r10
-	addq	%r15,%rcx
-	movq	%r10,%r13
-	addq	%rcx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rcx
-	movq	%r11,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r10,%r13
-	xorq	%rax,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rcx,%r14
-	andq	%r10,%r12
-	xorq	%r10,%r13
-	addq	48(%rsp),%rbx
-	movq	%rcx,%r15
-	xorq	%rax,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rdx,%r15
-	addq	%r12,%rbx
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%rcx,%r14
-	addq	%r13,%rbx
-	xorq	%rdx,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%rbx,%r9
-	addq	%rdi,%rbx
-	movq	%r9,%r13
-	addq	%rbx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rbx
-	movq	%r10,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rbx,%r14
-	andq	%r9,%r12
-	xorq	%r9,%r13
-	addq	56(%rsp),%rax
-	movq	%rbx,%rdi
-	xorq	%r11,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rcx,%rdi
-	addq	%r12,%rax
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%rbx,%r14
-	addq	%r13,%rax
-	xorq	%rcx,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rax,%r8
-	addq	%r15,%rax
-	movq	%r8,%r13
-	addq	%rax,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rax
-	movq	%r9,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r8,%r13
-	xorq	%r10,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rax,%r14
-	andq	%r8,%r12
-	xorq	%r8,%r13
-	addq	64(%rsp),%r11
-	movq	%rax,%r15
-	xorq	%r10,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rbx,%r15
-	addq	%r12,%r11
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%rax,%r14
-	addq	%r13,%r11
-	xorq	%rbx,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%r11,%rdx
-	addq	%rdi,%r11
-	movq	%rdx,%r13
-	addq	%r11,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r11
-	movq	%r8,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rdx,%r13
-	xorq	%r9,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r11,%r14
-	andq	%rdx,%r12
-	xorq	%rdx,%r13
-	addq	72(%rsp),%r10
-	movq	%r11,%rdi
-	xorq	%r9,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rax,%rdi
-	addq	%r12,%r10
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%r11,%r14
-	addq	%r13,%r10
-	xorq	%rax,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r10,%rcx
-	addq	%r15,%r10
-	movq	%rcx,%r13
-	addq	%r10,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r10
-	movq	%rdx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rcx,%r13
-	xorq	%r8,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r10,%r14
-	andq	%rcx,%r12
-	xorq	%rcx,%r13
-	addq	80(%rsp),%r9
-	movq	%r10,%r15
-	xorq	%r8,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r11,%r15
-	addq	%r12,%r9
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%r10,%r14
-	addq	%r13,%r9
-	xorq	%r11,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%r9,%rbx
-	addq	%rdi,%r9
-	movq	%rbx,%r13
-	addq	%r9,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r9
-	movq	%rcx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rbx,%r13
-	xorq	%rdx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r9,%r14
-	andq	%rbx,%r12
-	xorq	%rbx,%r13
-	addq	88(%rsp),%r8
-	movq	%r9,%rdi
-	xorq	%rdx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r10,%rdi
-	addq	%r12,%r8
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%r9,%r14
-	addq	%r13,%r8
-	xorq	%r10,%r15
-	shrdq	$28,%r14,%r14
-	addq	%r8,%rax
-	addq	%r15,%r8
-	movq	%rax,%r13
-	addq	%r8,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%r8
-	movq	%rbx,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%rax,%r13
-	xorq	%rcx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%r8,%r14
-	andq	%rax,%r12
-	xorq	%rax,%r13
-	addq	96(%rsp),%rdx
-	movq	%r8,%r15
-	xorq	%rcx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r9,%r15
-	addq	%r12,%rdx
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%r8,%r14
-	addq	%r13,%rdx
-	xorq	%r9,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%rdx,%r11
-	addq	%rdi,%rdx
-	movq	%r11,%r13
-	addq	%rdx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rdx
-	movq	%rax,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r11,%r13
-	xorq	%rbx,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rdx,%r14
-	andq	%r11,%r12
-	xorq	%r11,%r13
-	addq	104(%rsp),%rcx
-	movq	%rdx,%rdi
-	xorq	%rbx,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%r8,%rdi
-	addq	%r12,%rcx
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%rdx,%r14
-	addq	%r13,%rcx
-	xorq	%r8,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rcx,%r10
-	addq	%r15,%rcx
-	movq	%r10,%r13
-	addq	%rcx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rcx
-	movq	%r11,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r10,%r13
-	xorq	%rax,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rcx,%r14
-	andq	%r10,%r12
-	xorq	%r10,%r13
-	addq	112(%rsp),%rbx
-	movq	%rcx,%r15
-	xorq	%rax,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rdx,%r15
-	addq	%r12,%rbx
-	shrdq	$14,%r13,%r13
-	andq	%r15,%rdi
-	xorq	%rcx,%r14
-	addq	%r13,%rbx
-	xorq	%rdx,%rdi
-	shrdq	$28,%r14,%r14
-	addq	%rbx,%r9
-	addq	%rdi,%rbx
-	movq	%r9,%r13
-	addq	%rbx,%r14
-	shrdq	$23,%r13,%r13
-	movq	%r14,%rbx
-	movq	%r10,%r12
-	shrdq	$5,%r14,%r14
-	xorq	%r9,%r13
-	xorq	%r11,%r12
-	shrdq	$4,%r13,%r13
-	xorq	%rbx,%r14
-	andq	%r9,%r12
-	xorq	%r9,%r13
-	addq	120(%rsp),%rax
-	movq	%rbx,%rdi
-	xorq	%r11,%r12
-	shrdq	$6,%r14,%r14
-	xorq	%rcx,%rdi
-	addq	%r12,%rax
-	shrdq	$14,%r13,%r13
-	andq	%rdi,%r15
-	xorq	%rbx,%r14
-	addq	%r13,%rax
-	xorq	%rcx,%r15
-	shrdq	$28,%r14,%r14
-	addq	%rax,%r8
-	addq	%r15,%rax
-	movq	%r8,%r13
-	addq	%rax,%r14
-	movq	128+0(%rsp),%rdi
-	movq	%r14,%rax
-
-	addq	0(%rdi),%rax
-	leaq	128(%rsi),%rsi
-	addq	8(%rdi),%rbx
-	addq	16(%rdi),%rcx
-	addq	24(%rdi),%rdx
-	addq	32(%rdi),%r8
-	addq	40(%rdi),%r9
-	addq	48(%rdi),%r10
-	addq	56(%rdi),%r11
-
-	cmpq	128+16(%rsp),%rsi
-
-	movq	%rax,0(%rdi)
-	movq	%rbx,8(%rdi)
-	movq	%rcx,16(%rdi)
-	movq	%rdx,24(%rdi)
-	movq	%r8,32(%rdi)
-	movq	%r9,40(%rdi)
-	movq	%r10,48(%rdi)
-	movq	%r11,56(%rdi)
-	jb	L$loop_avx
-
-	movq	152(%rsp),%rsi
-
-	vzeroupper
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	movq	-16(%rsi),%rbp
-
-	movq	-8(%rsi),%rbx
-
-	leaq	(%rsi),%rsp
-
-L$epilogue_avx:
-	ret
-
-
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S b/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S
deleted file mode 100644
index 5aea40f..0000000
--- a/apple-x86_64/crypto/fipsmodule/vpaes-x86_64-apple.S
+++ /dev/null
@@ -1,1131 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_encrypt_core:
-
-	movq	%rdx,%r9
-	movq	$16,%r11
-	movl	240(%rdx),%eax
-	movdqa	%xmm9,%xmm1
-	movdqa	L$k_ipt(%rip),%xmm2
-	pandn	%xmm0,%xmm1
-	movdqu	(%r9),%xmm5
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
-	movdqa	L$k_ipt+16(%rip),%xmm0
-.byte	102,15,56,0,193
-	pxor	%xmm5,%xmm2
-	addq	$16,%r9
-	pxor	%xmm2,%xmm0
-	leaq	L$k_mc_backward(%rip),%r10
-	jmp	L$enc_entry
-
-.p2align	4
-L$enc_loop:
-
-	movdqa	%xmm13,%xmm4
-	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,226
-.byte	102,15,56,0,195
-	pxor	%xmm5,%xmm4
-	movdqa	%xmm15,%xmm5
-	pxor	%xmm4,%xmm0
-	movdqa	-64(%r11,%r10,1),%xmm1
-.byte	102,15,56,0,234
-	movdqa	(%r11,%r10,1),%xmm4
-	movdqa	%xmm14,%xmm2
-.byte	102,15,56,0,211
-	movdqa	%xmm0,%xmm3
-	pxor	%xmm5,%xmm2
-.byte	102,15,56,0,193
-	addq	$16,%r9
-	pxor	%xmm2,%xmm0
-.byte	102,15,56,0,220
-	addq	$16,%r11
-	pxor	%xmm0,%xmm3
-.byte	102,15,56,0,193
-	andq	$0x30,%r11
-	subq	$1,%rax
-	pxor	%xmm3,%xmm0
-
-L$enc_entry:
-
-	movdqa	%xmm9,%xmm1
-	movdqa	%xmm11,%xmm5
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm0
-.byte	102,15,56,0,232
-	movdqa	%xmm10,%xmm3
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
-	movdqa	%xmm10,%xmm4
-	pxor	%xmm5,%xmm3
-.byte	102,15,56,0,224
-	movdqa	%xmm10,%xmm2
-	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,211
-	movdqa	%xmm10,%xmm3
-	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
-	movdqu	(%r9),%xmm5
-	pxor	%xmm1,%xmm3
-	jnz	L$enc_loop
-
-
-	movdqa	-96(%r10),%xmm4
-	movdqa	-80(%r10),%xmm0
-.byte	102,15,56,0,226
-	pxor	%xmm5,%xmm4
-.byte	102,15,56,0,195
-	movdqa	64(%r11,%r10,1),%xmm1
-	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,193
-	ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_encrypt_core_2x:
-
-	movq	%rdx,%r9
-	movq	$16,%r11
-	movl	240(%rdx),%eax
-	movdqa	%xmm9,%xmm1
-	movdqa	%xmm9,%xmm7
-	movdqa	L$k_ipt(%rip),%xmm2
-	movdqa	%xmm2,%xmm8
-	pandn	%xmm0,%xmm1
-	pandn	%xmm6,%xmm7
-	movdqu	(%r9),%xmm5
-
-	psrld	$4,%xmm1
-	psrld	$4,%xmm7
-	pand	%xmm9,%xmm0
-	pand	%xmm9,%xmm6
-.byte	102,15,56,0,208
-.byte	102,68,15,56,0,198
-	movdqa	L$k_ipt+16(%rip),%xmm0
-	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,247
-	pxor	%xmm5,%xmm2
-	pxor	%xmm5,%xmm8
-	addq	$16,%r9
-	pxor	%xmm2,%xmm0
-	pxor	%xmm8,%xmm6
-	leaq	L$k_mc_backward(%rip),%r10
-	jmp	L$enc2x_entry
-
-.p2align	4
-L$enc2x_loop:
-
-	movdqa	L$k_sb1(%rip),%xmm4
-	movdqa	L$k_sb1+16(%rip),%xmm0
-	movdqa	%xmm4,%xmm12
-	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
-	pxor	%xmm5,%xmm4
-	pxor	%xmm5,%xmm12
-	movdqa	L$k_sb2(%rip),%xmm5
-	movdqa	%xmm5,%xmm13
-	pxor	%xmm4,%xmm0
-	pxor	%xmm12,%xmm6
-	movdqa	-64(%r11,%r10,1),%xmm1
-
-.byte	102,15,56,0,234
-.byte	102,69,15,56,0,232
-	movdqa	(%r11,%r10,1),%xmm4
-
-	movdqa	L$k_sb2+16(%rip),%xmm2
-	movdqa	%xmm2,%xmm8
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
-	movdqa	%xmm0,%xmm3
-	movdqa	%xmm6,%xmm11
-	pxor	%xmm5,%xmm2
-	pxor	%xmm13,%xmm8
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
-	addq	$16,%r9
-	pxor	%xmm2,%xmm0
-	pxor	%xmm8,%xmm6
-.byte	102,15,56,0,220
-.byte	102,68,15,56,0,220
-	addq	$16,%r11
-	pxor	%xmm0,%xmm3
-	pxor	%xmm6,%xmm11
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
-	andq	$0x30,%r11
-	subq	$1,%rax
-	pxor	%xmm3,%xmm0
-	pxor	%xmm11,%xmm6
-
-L$enc2x_entry:
-
-	movdqa	%xmm9,%xmm1
-	movdqa	%xmm9,%xmm7
-	movdqa	L$k_inv+16(%rip),%xmm5
-	movdqa	%xmm5,%xmm13
-	pandn	%xmm0,%xmm1
-	pandn	%xmm6,%xmm7
-	psrld	$4,%xmm1
-	psrld	$4,%xmm7
-	pand	%xmm9,%xmm0
-	pand	%xmm9,%xmm6
-.byte	102,15,56,0,232
-.byte	102,68,15,56,0,238
-	movdqa	%xmm10,%xmm3
-	movdqa	%xmm10,%xmm11
-	pxor	%xmm1,%xmm0
-	pxor	%xmm7,%xmm6
-.byte	102,15,56,0,217
-.byte	102,68,15,56,0,223
-	movdqa	%xmm10,%xmm4
-	movdqa	%xmm10,%xmm12
-	pxor	%xmm5,%xmm3
-	pxor	%xmm13,%xmm11
-.byte	102,15,56,0,224
-.byte	102,68,15,56,0,230
-	movdqa	%xmm10,%xmm2
-	movdqa	%xmm10,%xmm8
-	pxor	%xmm5,%xmm4
-	pxor	%xmm13,%xmm12
-.byte	102,15,56,0,211
-.byte	102,69,15,56,0,195
-	movdqa	%xmm10,%xmm3
-	movdqa	%xmm10,%xmm11
-	pxor	%xmm0,%xmm2
-	pxor	%xmm6,%xmm8
-.byte	102,15,56,0,220
-.byte	102,69,15,56,0,220
-	movdqu	(%r9),%xmm5
-
-	pxor	%xmm1,%xmm3
-	pxor	%xmm7,%xmm11
-	jnz	L$enc2x_loop
-
-
-	movdqa	-96(%r10),%xmm4
-	movdqa	-80(%r10),%xmm0
-	movdqa	%xmm4,%xmm12
-	movdqa	%xmm0,%xmm6
-.byte	102,15,56,0,226
-.byte	102,69,15,56,0,224
-	pxor	%xmm5,%xmm4
-	pxor	%xmm5,%xmm12
-.byte	102,15,56,0,195
-.byte	102,65,15,56,0,243
-	movdqa	64(%r11,%r10,1),%xmm1
-
-	pxor	%xmm4,%xmm0
-	pxor	%xmm12,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
-	ret
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_decrypt_core:
-
-	movq	%rdx,%r9
-	movl	240(%rdx),%eax
-	movdqa	%xmm9,%xmm1
-	movdqa	L$k_dipt(%rip),%xmm2
-	pandn	%xmm0,%xmm1
-	movq	%rax,%r11
-	psrld	$4,%xmm1
-	movdqu	(%r9),%xmm5
-	shlq	$4,%r11
-	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
-	movdqa	L$k_dipt+16(%rip),%xmm0
-	xorq	$0x30,%r11
-	leaq	L$k_dsbd(%rip),%r10
-.byte	102,15,56,0,193
-	andq	$0x30,%r11
-	pxor	%xmm5,%xmm2
-	movdqa	L$k_mc_forward+48(%rip),%xmm5
-	pxor	%xmm2,%xmm0
-	addq	$16,%r9
-	addq	%r10,%r11
-	jmp	L$dec_entry
-
-.p2align	4
-L$dec_loop:
-
-
-
-	movdqa	-32(%r10),%xmm4
-	movdqa	-16(%r10),%xmm1
-.byte	102,15,56,0,226
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	0(%r10),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	16(%r10),%xmm1
-
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	32(%r10),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	48(%r10),%xmm1
-
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	movdqa	64(%r10),%xmm4
-	pxor	%xmm1,%xmm0
-	movdqa	80(%r10),%xmm1
-
-.byte	102,15,56,0,226
-.byte	102,15,56,0,197
-.byte	102,15,56,0,203
-	pxor	%xmm4,%xmm0
-	addq	$16,%r9
-.byte	102,15,58,15,237,12
-	pxor	%xmm1,%xmm0
-	subq	$1,%rax
-
-L$dec_entry:
-
-	movdqa	%xmm9,%xmm1
-	pandn	%xmm0,%xmm1
-	movdqa	%xmm11,%xmm2
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm0
-.byte	102,15,56,0,208
-	movdqa	%xmm10,%xmm3
-	pxor	%xmm1,%xmm0
-.byte	102,15,56,0,217
-	movdqa	%xmm10,%xmm4
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,224
-	pxor	%xmm2,%xmm4
-	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
-	movdqa	%xmm10,%xmm3
-	pxor	%xmm0,%xmm2
-.byte	102,15,56,0,220
-	movdqu	(%r9),%xmm0
-	pxor	%xmm1,%xmm3
-	jnz	L$dec_loop
-
-
-	movdqa	96(%r10),%xmm4
-.byte	102,15,56,0,226
-	pxor	%xmm0,%xmm4
-	movdqa	112(%r10),%xmm0
-	movdqa	-352(%r11),%xmm2
-.byte	102,15,56,0,195
-	pxor	%xmm4,%xmm0
-.byte	102,15,56,0,194
-	ret
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_schedule_core:
-
-
-
-
-
-
-	call	_vpaes_preheat
-	movdqa	L$k_rcon(%rip),%xmm8
-	movdqu	(%rdi),%xmm0
-
-
-	movdqa	%xmm0,%xmm3
-	leaq	L$k_ipt(%rip),%r11
-	call	_vpaes_schedule_transform
-	movdqa	%xmm0,%xmm7
-
-	leaq	L$k_sr(%rip),%r10
-	testq	%rcx,%rcx
-	jnz	L$schedule_am_decrypting
-
-
-	movdqu	%xmm0,(%rdx)
-	jmp	L$schedule_go
-
-L$schedule_am_decrypting:
-
-	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
-	movdqu	%xmm3,(%rdx)
-	xorq	$0x30,%r8
-
-L$schedule_go:
-	cmpl	$192,%esi
-	ja	L$schedule_256
-	je	L$schedule_192
-
-
-
-
-
-
-
-
-
-
-L$schedule_128:
-	movl	$10,%esi
-
-L$oop_schedule_128:
-	call	_vpaes_schedule_round
-	decq	%rsi
-	jz	L$schedule_mangle_last
-	call	_vpaes_schedule_mangle
-	jmp	L$oop_schedule_128
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-L$schedule_192:
-	movdqu	8(%rdi),%xmm0
-	call	_vpaes_schedule_transform
-	movdqa	%xmm0,%xmm6
-	pxor	%xmm4,%xmm4
-	movhlps	%xmm4,%xmm6
-	movl	$4,%esi
-
-L$oop_schedule_192:
-	call	_vpaes_schedule_round
-.byte	102,15,58,15,198,8
-	call	_vpaes_schedule_mangle
-	call	_vpaes_schedule_192_smear
-	call	_vpaes_schedule_mangle
-	call	_vpaes_schedule_round
-	decq	%rsi
-	jz	L$schedule_mangle_last
-	call	_vpaes_schedule_mangle
-	call	_vpaes_schedule_192_smear
-	jmp	L$oop_schedule_192
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-L$schedule_256:
-	movdqu	16(%rdi),%xmm0
-	call	_vpaes_schedule_transform
-	movl	$7,%esi
-
-L$oop_schedule_256:
-	call	_vpaes_schedule_mangle
-	movdqa	%xmm0,%xmm6
-
-
-	call	_vpaes_schedule_round
-	decq	%rsi
-	jz	L$schedule_mangle_last
-	call	_vpaes_schedule_mangle
-
-
-	pshufd	$0xFF,%xmm0,%xmm0
-	movdqa	%xmm7,%xmm5
-	movdqa	%xmm6,%xmm7
-	call	_vpaes_schedule_low_round
-	movdqa	%xmm5,%xmm7
-
-	jmp	L$oop_schedule_256
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-L$schedule_mangle_last:
-
-	leaq	L$k_deskew(%rip),%r11
-	testq	%rcx,%rcx
-	jnz	L$schedule_mangle_last_dec
-
-
-	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,193
-	leaq	L$k_opt(%rip),%r11
-	addq	$32,%rdx
-
-L$schedule_mangle_last_dec:
-	addq	$-16,%rdx
-	pxor	L$k_s63(%rip),%xmm0
-	call	_vpaes_schedule_transform
-	movdqu	%xmm0,(%rdx)
-
-
-	pxor	%xmm0,%xmm0
-	pxor	%xmm1,%xmm1
-	pxor	%xmm2,%xmm2
-	pxor	%xmm3,%xmm3
-	pxor	%xmm4,%xmm4
-	pxor	%xmm5,%xmm5
-	pxor	%xmm6,%xmm6
-	pxor	%xmm7,%xmm7
-	ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_schedule_192_smear:
-
-	pshufd	$0x80,%xmm6,%xmm1
-	pshufd	$0xFE,%xmm7,%xmm0
-	pxor	%xmm1,%xmm6
-	pxor	%xmm1,%xmm1
-	pxor	%xmm0,%xmm6
-	movdqa	%xmm6,%xmm0
-	movhlps	%xmm1,%xmm6
-	ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_schedule_round:
-
-
-	pxor	%xmm1,%xmm1
-.byte	102,65,15,58,15,200,15
-.byte	102,69,15,58,15,192,15
-	pxor	%xmm1,%xmm7
-
-
-	pshufd	$0xFF,%xmm0,%xmm0
-.byte	102,15,58,15,192,1
-
-
-
-
-_vpaes_schedule_low_round:
-
-	movdqa	%xmm7,%xmm1
-	pslldq	$4,%xmm7
-	pxor	%xmm1,%xmm7
-	movdqa	%xmm7,%xmm1
-	pslldq	$8,%xmm7
-	pxor	%xmm1,%xmm7
-	pxor	L$k_s63(%rip),%xmm7
-
-
-	movdqa	%xmm9,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm0
-	movdqa	%xmm11,%xmm2
-.byte	102,15,56,0,208
-	pxor	%xmm1,%xmm0
-	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-	movdqa	%xmm10,%xmm4
-.byte	102,15,56,0,224
-	pxor	%xmm2,%xmm4
-	movdqa	%xmm10,%xmm2
-.byte	102,15,56,0,211
-	pxor	%xmm0,%xmm2
-	movdqa	%xmm10,%xmm3
-.byte	102,15,56,0,220
-	pxor	%xmm1,%xmm3
-	movdqa	%xmm13,%xmm4
-.byte	102,15,56,0,226
-	movdqa	%xmm12,%xmm0
-.byte	102,15,56,0,195
-	pxor	%xmm4,%xmm0
-
-
-	pxor	%xmm7,%xmm0
-	movdqa	%xmm0,%xmm7
-	ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_schedule_transform:
-
-	movdqa	%xmm9,%xmm1
-	pandn	%xmm0,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm0
-	movdqa	(%r11),%xmm2
-.byte	102,15,56,0,208
-	movdqa	16(%r11),%xmm0
-.byte	102,15,56,0,193
-	pxor	%xmm2,%xmm0
-	ret
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_schedule_mangle:
-
-	movdqa	%xmm0,%xmm4
-	movdqa	L$k_mc_forward(%rip),%xmm5
-	testq	%rcx,%rcx
-	jnz	L$schedule_mangle_dec
-
-
-	addq	$16,%rdx
-	pxor	L$k_s63(%rip),%xmm4
-.byte	102,15,56,0,229
-	movdqa	%xmm4,%xmm3
-.byte	102,15,56,0,229
-	pxor	%xmm4,%xmm3
-.byte	102,15,56,0,229
-	pxor	%xmm4,%xmm3
-
-	jmp	L$schedule_mangle_both
-.p2align	4
-L$schedule_mangle_dec:
-
-	leaq	L$k_dksd(%rip),%r11
-	movdqa	%xmm9,%xmm1
-	pandn	%xmm4,%xmm1
-	psrld	$4,%xmm1
-	pand	%xmm9,%xmm4
-
-	movdqa	0(%r11),%xmm2
-.byte	102,15,56,0,212
-	movdqa	16(%r11),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-
-	movdqa	32(%r11),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	48(%r11),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-
-	movdqa	64(%r11),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	80(%r11),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-.byte	102,15,56,0,221
-
-	movdqa	96(%r11),%xmm2
-.byte	102,15,56,0,212
-	pxor	%xmm3,%xmm2
-	movdqa	112(%r11),%xmm3
-.byte	102,15,56,0,217
-	pxor	%xmm2,%xmm3
-
-	addq	$-16,%rdx
-
-L$schedule_mangle_both:
-	movdqa	(%r8,%r10,1),%xmm1
-.byte	102,15,56,0,217
-	addq	$-16,%r8
-	andq	$0x30,%r8
-	movdqu	%xmm3,(%rdx)
-	ret
-
-
-
-
-
-
-.globl	_vpaes_set_encrypt_key
-.private_extern _vpaes_set_encrypt_key
-
-.p2align	4
-_vpaes_set_encrypt_key:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
-	movb	$1,_BORINGSSL_function_hit+5(%rip)
-#endif
-
-	movl	%esi,%eax
-	shrl	$5,%eax
-	addl	$5,%eax
-	movl	%eax,240(%rdx)
-
-	movl	$0,%ecx
-	movl	$0x30,%r8d
-	call	_vpaes_schedule_core
-	xorl	%eax,%eax
-	ret
-
-
-
-.globl	_vpaes_set_decrypt_key
-.private_extern _vpaes_set_decrypt_key
-
-.p2align	4
-_vpaes_set_decrypt_key:
-
-_CET_ENDBR
-	movl	%esi,%eax
-	shrl	$5,%eax
-	addl	$5,%eax
-	movl	%eax,240(%rdx)
-	shll	$4,%eax
-	leaq	16(%rdx,%rax,1),%rdx
-
-	movl	$1,%ecx
-	movl	%esi,%r8d
-	shrl	$1,%r8d
-	andl	$32,%r8d
-	xorl	$32,%r8d
-	call	_vpaes_schedule_core
-	xorl	%eax,%eax
-	ret
-
-
-
-.globl	_vpaes_encrypt
-.private_extern _vpaes_encrypt
-
-.p2align	4
-_vpaes_encrypt:
-
-_CET_ENDBR
-#ifdef BORINGSSL_DISPATCH_TEST
-
-	movb	$1,_BORINGSSL_function_hit+4(%rip)
-#endif
-	movdqu	(%rdi),%xmm0
-	call	_vpaes_preheat
-	call	_vpaes_encrypt_core
-	movdqu	%xmm0,(%rsi)
-	ret
-
-
-
-.globl	_vpaes_decrypt
-.private_extern _vpaes_decrypt
-
-.p2align	4
-_vpaes_decrypt:
-
-_CET_ENDBR
-	movdqu	(%rdi),%xmm0
-	call	_vpaes_preheat
-	call	_vpaes_decrypt_core
-	movdqu	%xmm0,(%rsi)
-	ret
-
-
-.globl	_vpaes_cbc_encrypt
-.private_extern _vpaes_cbc_encrypt
-
-.p2align	4
-_vpaes_cbc_encrypt:
-
-_CET_ENDBR
-	xchgq	%rcx,%rdx
-	subq	$16,%rcx
-	jc	L$cbc_abort
-	movdqu	(%r8),%xmm6
-	subq	%rdi,%rsi
-	call	_vpaes_preheat
-	cmpl	$0,%r9d
-	je	L$cbc_dec_loop
-	jmp	L$cbc_enc_loop
-.p2align	4
-L$cbc_enc_loop:
-	movdqu	(%rdi),%xmm0
-	pxor	%xmm6,%xmm0
-	call	_vpaes_encrypt_core
-	movdqa	%xmm0,%xmm6
-	movdqu	%xmm0,(%rsi,%rdi,1)
-	leaq	16(%rdi),%rdi
-	subq	$16,%rcx
-	jnc	L$cbc_enc_loop
-	jmp	L$cbc_done
-.p2align	4
-L$cbc_dec_loop:
-	movdqu	(%rdi),%xmm0
-	movdqa	%xmm0,%xmm7
-	call	_vpaes_decrypt_core
-	pxor	%xmm6,%xmm0
-	movdqa	%xmm7,%xmm6
-	movdqu	%xmm0,(%rsi,%rdi,1)
-	leaq	16(%rdi),%rdi
-	subq	$16,%rcx
-	jnc	L$cbc_dec_loop
-L$cbc_done:
-	movdqu	%xmm6,(%r8)
-L$cbc_abort:
-	ret
-
-
-.globl	_vpaes_ctr32_encrypt_blocks
-.private_extern _vpaes_ctr32_encrypt_blocks
-
-.p2align	4
-_vpaes_ctr32_encrypt_blocks:
-
-_CET_ENDBR
-
-	xchgq	%rcx,%rdx
-	testq	%rcx,%rcx
-	jz	L$ctr32_abort
-	movdqu	(%r8),%xmm0
-	movdqa	L$ctr_add_one(%rip),%xmm8
-	subq	%rdi,%rsi
-	call	_vpaes_preheat
-	movdqa	%xmm0,%xmm6
-	pshufb	L$rev_ctr(%rip),%xmm6
-
-	testq	$1,%rcx
-	jz	L$ctr32_prep_loop
-
-
-
-	movdqu	(%rdi),%xmm7
-	call	_vpaes_encrypt_core
-	pxor	%xmm7,%xmm0
-	paddd	%xmm8,%xmm6
-	movdqu	%xmm0,(%rsi,%rdi,1)
-	subq	$1,%rcx
-	leaq	16(%rdi),%rdi
-	jz	L$ctr32_done
-
-L$ctr32_prep_loop:
-
-
-	movdqa	%xmm6,%xmm14
-	movdqa	%xmm6,%xmm15
-	paddd	%xmm8,%xmm15
-
-L$ctr32_loop:
-	movdqa	L$rev_ctr(%rip),%xmm1
-	movdqa	%xmm14,%xmm0
-	movdqa	%xmm15,%xmm6
-.byte	102,15,56,0,193
-.byte	102,15,56,0,241
-	call	_vpaes_encrypt_core_2x
-	movdqu	(%rdi),%xmm1
-	movdqu	16(%rdi),%xmm2
-	movdqa	L$ctr_add_two(%rip),%xmm3
-	pxor	%xmm1,%xmm0
-	pxor	%xmm2,%xmm6
-	paddd	%xmm3,%xmm14
-	paddd	%xmm3,%xmm15
-	movdqu	%xmm0,(%rsi,%rdi,1)
-	movdqu	%xmm6,16(%rsi,%rdi,1)
-	subq	$2,%rcx
-	leaq	32(%rdi),%rdi
-	jnz	L$ctr32_loop
-
-L$ctr32_done:
-L$ctr32_abort:
-	ret
-
-
-
-
-
-
-
-
-
-.p2align	4
-_vpaes_preheat:
-
-	leaq	L$k_s0F(%rip),%r10
-	movdqa	-32(%r10),%xmm10
-	movdqa	-16(%r10),%xmm11
-	movdqa	0(%r10),%xmm9
-	movdqa	48(%r10),%xmm13
-	movdqa	64(%r10),%xmm12
-	movdqa	80(%r10),%xmm15
-	movdqa	96(%r10),%xmm14
-	ret
-
-
-
-
-
-
-
-
-.section	__DATA,__const
-.p2align	6
-_vpaes_consts:
-L$k_inv:
-.quad	0x0E05060F0D080180, 0x040703090A0B0C02
-.quad	0x01040A060F0B0780, 0x030D0E0C02050809
-
-L$k_s0F:
-.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
-
-L$k_ipt:
-.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
-.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
-
-L$k_sb1:
-.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
-.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
-L$k_sb2:
-.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
-.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
-L$k_sbo:
-.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
-.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
-
-L$k_mc_forward:
-.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
-.quad	0x080B0A0904070605, 0x000302010C0F0E0D
-.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
-.quad	0x000302010C0F0E0D, 0x080B0A0904070605
-
-L$k_mc_backward:
-.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
-.quad	0x020100030E0D0C0F, 0x0A09080B06050407
-.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
-.quad	0x0A09080B06050407, 0x020100030E0D0C0F
-
-L$k_sr:
-.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
-.quad	0x030E09040F0A0500, 0x0B06010C07020D08
-.quad	0x0F060D040B020900, 0x070E050C030A0108
-.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
-
-L$k_rcon:
-.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-L$k_s63:
-.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
-
-L$k_opt:
-.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
-.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-
-L$k_deskew:
-.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
-.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-
-
-
-
-L$k_dksd:
-.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
-.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-L$k_dksb:
-.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
-.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-L$k_dkse:
-.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
-.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-L$k_dks9:
-.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
-.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-
-
-
-
-L$k_dipt:
-.quad	0x0F505B040B545F00, 0x154A411E114E451A
-.quad	0x86E383E660056500, 0x12771772F491F194
-
-L$k_dsb9:
-.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
-.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-L$k_dsbd:
-.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
-.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-L$k_dsbb:
-.quad	0xD022649296B44200, 0x602646F6B0F2D404
-.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-L$k_dsbe:
-.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
-.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-L$k_dsbo:
-.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
-.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-
-
-L$rev_ctr:
-.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
-
-
-L$ctr_add_one:
-.quad	0x0000000000000000, 0x0000000100000000
-L$ctr_add_two:
-.quad	0x0000000000000000, 0x0000000200000000
-
-.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
-.p2align	6
-
-.text	
-#endif
diff --git a/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S
deleted file mode 100644
index a4c719c..0000000
--- a/apple-x86_64/crypto/fipsmodule/x86_64-mont-apple.S
+++ /dev/null
@@ -1,1250 +0,0 @@
-// This file is generated from a similarly-named Perl script in the BoringSSL
-// source tree. Do not edit by hand.
-
-#include <openssl/asm_base.h>
-
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
-.text	
-
-
-
-.globl	_bn_mul_mont
-.private_extern _bn_mul_mont
-
-.p2align	4
-_bn_mul_mont:
-
-_CET_ENDBR
-	movl	%r9d,%r9d
-	movq	%rsp,%rax
-
-	testl	$3,%r9d
-	jnz	L$mul_enter
-	cmpl	$8,%r9d
-	jb	L$mul_enter
-	leaq	_OPENSSL_ia32cap_P(%rip),%r11
-	movl	8(%r11),%r11d
-	cmpq	%rsi,%rdx
-	jne	L$mul4x_enter
-	testl	$7,%r9d
-	jz	L$sqr8x_enter
-	jmp	L$mul4x_enter
-
-.p2align	4
-L$mul_enter:
-	pushq	%rbx
-
-	pushq	%rbp
-
-	pushq	%r12
-
-	pushq	%r13
-
-	pushq	%r14
-
-	pushq	%r15
-
-
-	negq	%r9
-	movq	%rsp,%r11
-	leaq	-16(%rsp,%r9,8),%r10
-	negq	%r9
-	andq	$-1024,%r10
-
-
-
-
-
-
-
-
-
-	subq	%r10,%r11
-	andq	$-4096,%r11
-	leaq	(%r10,%r11,1),%rsp
-	movq	(%rsp),%r11
-	cmpq	%r10,%rsp
-	ja	L$mul_page_walk
-	jmp	L$mul_page_walk_done
-
-.p2align	4
-L$mul_page_walk:
-	leaq	-4096(%rsp),%rsp
-	movq	(%rsp),%r11
-	cmpq	%r10,%rsp
-	ja	L$mul_page_walk
-L$mul_page_walk_done:
-
-	movq	%rax,8(%rsp,%r9,8)
-
-L$mul_body:
-	movq	%rdx,%r12
-	movq	(%r8),%r8
-	movq	(%r12),%rbx
-	movq	(%rsi),%rax
-
-	xorq	%r14,%r14
-	xorq	%r15,%r15
-
-	movq	%r8,%rbp
-	mulq	%rbx
-	movq	%rax,%r10
-	movq	(%rcx),%rax
-
-	imulq	%r10,%rbp
-	movq	%rdx,%r11
-
-	mulq	%rbp
-	addq	%rax,%r10
-	movq	8(%rsi),%rax
-	adcq	$0,%rdx
-	movq	%rdx,%r13
-
-	leaq	1(%r15),%r15
-	jmp	L$1st_enter
-
-.p2align	4
-L$1st:
-	addq	%rax,%r13
-	movq	(%rsi,%r15,8),%rax
-	adcq	$0,%rdx
-	addq	%r11,%r13
-	movq	%r10,%r11
-	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
-	movq	%rdx,%r13
-
-L$1st_enter:
-	mulq	%rbx
-	addq	%rax,%r11
-	movq	(%rcx,%r15,8),%rax
-	adcq	$0,%rdx
-	leaq	1(%r15),%r15
-	movq	%rdx,%r10
-
-	mulq	%rbp
-	cmpq	%r9,%r15
-	jne	L$1st
-
-	addq	%rax,%r13
-	movq	(%rsi),%rax
-	adcq	$0,%rdx
-	addq	%r11,%r13
-	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
-	movq	%rdx,%r13
-	movq	%r10,%r11
-
-	xorq	%rdx,%rdx
-	addq	%r11,%r13
-	adcq	$0,%rdx
-	movq	%r13,-8(%rsp,%r9,8)
-	movq	%rdx,(%rsp,%r9,8)
-
-	leaq	1(%r14),%r14
-	jmp	L$outer
-.p2align	4
-L$outer:
-	movq	(%r12,%r14,8),%rbx
-	xorq	%r15,%r15
-	movq	%r8,%rbp
-	movq	(%rsp),%r10
-	mulq	%rbx
-	addq	%rax,%r10
-	movq	(%rcx),%rax
-	adcq	$0,%rdx
-
-	imulq	%r10,%rbp
-	movq	%rdx,%r11
-
-	mulq	%rbp
-	addq	%rax,%r10
-	movq	8(%rsi),%rax
-	adcq	$0,%rdx
-	movq	8(%rsp),%r10
-	movq	%rdx,%r13
-
-	leaq	1(%r15),%r15
-	jmp	L$inner_enter
-
-.p2align	4
-L$inner:
-	addq	%rax,%r13
-	movq	(%rsi,%r15,8),%rax
-	adcq	$0,%rdx
-	addq	%r10,%r13
-	movq	(%rsp,%r15,8),%r10
-	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
-	movq	%rdx,%r13
-
-L$inner_enter:
-	mulq	%rbx
-	addq	%rax,%r11
-	movq	(%rcx,%r15,8),%rax
-	adcq	$0,%rdx
-	addq	%r11,%r10
-	movq	%rdx,%r11
-	adcq	$0,%r11
-	leaq	1(%r15),%r15
-
-	mulq	%rbp
-	cmpq	%r9,%r15
-	jne	L$inner
-
-	addq	%rax,%r13
-	movq	(%rsi),%rax
-	adcq	$0,%rdx
-	addq	%r10,%r13
-	movq	(%rsp,%r15,8),%r10
-	adcq	$0,%rdx
-	movq	%r13,-16(%rsp,%r15,8)
-	movq	%rdx,%r13
-
-	xorq	%rdx,%rdx
-	addq	%r11,%r13
-	adcq	$0,%rdx
-	addq	%r10,%r13
-	adcq	$0,%rdx
-	movq	%r13,-8(%rsp,%r9,8)
-	movq	%rdx,(%rsp,%r9,8)
-
-	leaq	1(%r14),%r14
-	cmpq	%r9,%r14
-	jb	L$outer
-
-	xorq	%r14,%r14
-	movq	(%rsp),%rax
-	movq	%r9,%r15
-
-.p2align	4
-L$sub:	sbbq	(%rcx,%r14,8),%rax
-	movq	%rax,(%rdi,%r14,8)
-	movq	8(%rsp,%r14,8),%rax
-	leaq	1(%r14),%r14
-	decq	%r15
-	jnz	L$sub
-
-	sbbq	$0,%rax
-	movq	$-1,%rbx
-	xorq	%rax,%rbx
-	xorq	%r14,%r14
-	movq	%r9,%r15
-
-L$copy:
-	movq	(%rdi,%r14,8),%rcx
-	movq	(%rsp,%r14,8),%rdx
-	andq	%rbx,%rcx
-	andq	%rax,%rdx
-	movq	%r9,(%rsp,%r14,8)
-	orq	%rcx,%rdx
-	movq	%rdx,(%rdi,%r14,8)
-	leaq	1(%r14),%r14
-	subq	$1,%r15
-	jnz	L$copy
-
-	movq	8(%rsp,%r9,8),%rsi
-
-	movq	$1,%rax
-	movq	-48(%rsi),%r15
-
-	movq	-40(%rsi),%r14
-
-	movq	-32(%rsi),%r13
-
-	movq	-24(%rsi),%r12
-
-	mov