Roll generated files forward (#6)

Rolls the generated build files forward to BoringSSL revision
a62dbf88d8a3c04446db833a1eb80a620cb1514d. Notable upstream changes
reflected here: the FIPS module sources now live under
src/crypto/fipsmodule/ (with matching moves for the per-platform
assembly files), libssl has been rewritten in C++ (.c -> .cc),
BUILD.generated_tests.gni now exports plain gtest source lists instead
of per-test executables, and BUILD.gn builds with -std=c11/-std=c++14
instead of -std=c99.

diff --git a/BUILD.generated.gni b/BUILD.generated.gni
index 6836fab..07b6e91 100644
--- a/BUILD.generated.gni
+++ b/BUILD.generated.gni
@@ -6,10 +6,6 @@
 
 crypto_sources = [
   "err_data.c",
-  "src/crypto/aes/aes.c",
-  "src/crypto/aes/internal.h",
-  "src/crypto/aes/key_wrap.c",
-  "src/crypto/aes/mode_wrappers.c",
   "src/crypto/asn1/a_bitstr.c",
   "src/crypto/asn1/a_bool.c",
   "src/crypto/asn1/a_d2i_fp.c",
@@ -34,19 +30,16 @@
   "src/crypto/asn1/f_enum.c",
   "src/crypto/asn1/f_int.c",
   "src/crypto/asn1/f_string.c",
-  "src/crypto/asn1/t_bitst.c",
   "src/crypto/asn1/tasn_dec.c",
   "src/crypto/asn1/tasn_enc.c",
   "src/crypto/asn1/tasn_fre.c",
   "src/crypto/asn1/tasn_new.c",
   "src/crypto/asn1/tasn_typ.c",
   "src/crypto/asn1/tasn_utl.c",
-  "src/crypto/asn1/x_bignum.c",
-  "src/crypto/asn1/x_long.c",
+  "src/crypto/asn1/time_support.c",
   "src/crypto/base64/base64.c",
   "src/crypto/bio/bio.c",
   "src/crypto/bio/bio_mem.c",
-  "src/crypto/bio/buffer.c",
   "src/crypto/bio/connect.c",
   "src/crypto/bio/fd.c",
   "src/crypto/bio/file.c",
@@ -56,28 +49,8 @@
   "src/crypto/bio/printf.c",
   "src/crypto/bio/socket.c",
   "src/crypto/bio/socket_helper.c",
-  "src/crypto/bn/add.c",
-  "src/crypto/bn/asm/x86_64-gcc.c",
-  "src/crypto/bn/bn.c",
-  "src/crypto/bn/bn_asn1.c",
-  "src/crypto/bn/cmp.c",
-  "src/crypto/bn/convert.c",
-  "src/crypto/bn/ctx.c",
-  "src/crypto/bn/div.c",
-  "src/crypto/bn/exponentiation.c",
-  "src/crypto/bn/gcd.c",
-  "src/crypto/bn/generic.c",
-  "src/crypto/bn/internal.h",
-  "src/crypto/bn/kronecker.c",
-  "src/crypto/bn/montgomery.c",
-  "src/crypto/bn/montgomery_inv.c",
-  "src/crypto/bn/mul.c",
-  "src/crypto/bn/prime.c",
-  "src/crypto/bn/random.c",
-  "src/crypto/bn/rsaz_exp.c",
-  "src/crypto/bn/rsaz_exp.h",
-  "src/crypto/bn/shift.c",
-  "src/crypto/bn/sqrt.c",
+  "src/crypto/bn_extra/bn_asn1.c",
+  "src/crypto/bn_extra/convert.c",
   "src/crypto/buf/buf.c",
   "src/crypto/bytestring/asn1_compat.c",
   "src/crypto/bytestring/ber.c",
@@ -85,19 +58,18 @@
   "src/crypto/bytestring/cbs.c",
   "src/crypto/bytestring/internal.h",
   "src/crypto/chacha/chacha.c",
-  "src/crypto/cipher/aead.c",
-  "src/crypto/cipher/cipher.c",
-  "src/crypto/cipher/derive_key.c",
-  "src/crypto/cipher/e_aes.c",
-  "src/crypto/cipher/e_chacha20poly1305.c",
-  "src/crypto/cipher/e_des.c",
-  "src/crypto/cipher/e_null.c",
-  "src/crypto/cipher/e_rc2.c",
-  "src/crypto/cipher/e_rc4.c",
-  "src/crypto/cipher/e_ssl3.c",
-  "src/crypto/cipher/e_tls.c",
-  "src/crypto/cipher/internal.h",
-  "src/crypto/cipher/tls_cbc.c",
+  "src/crypto/cipher_extra/cipher_extra.c",
+  "src/crypto/cipher_extra/derive_key.c",
+  "src/crypto/cipher_extra/e_aesctrhmac.c",
+  "src/crypto/cipher_extra/e_aesgcmsiv.c",
+  "src/crypto/cipher_extra/e_chacha20poly1305.c",
+  "src/crypto/cipher_extra/e_null.c",
+  "src/crypto/cipher_extra/e_rc2.c",
+  "src/crypto/cipher_extra/e_rc4.c",
+  "src/crypto/cipher_extra/e_ssl3.c",
+  "src/crypto/cipher_extra/e_tls.c",
+  "src/crypto/cipher_extra/internal.h",
+  "src/crypto/cipher_extra/tls_cbc.c",
   "src/crypto/cmac/cmac.c",
   "src/crypto/conf/conf.c",
   "src/crypto/conf/conf_def.h",
@@ -108,41 +80,21 @@
   "src/crypto/cpu-intel.c",
   "src/crypto/cpu-ppc64le.c",
   "src/crypto/crypto.c",
-  "src/crypto/curve25519/curve25519.c",
-  "src/crypto/curve25519/internal.h",
   "src/crypto/curve25519/spake25519.c",
   "src/crypto/curve25519/x25519-x86_64.c",
-  "src/crypto/des/des.c",
-  "src/crypto/des/internal.h",
   "src/crypto/dh/check.c",
   "src/crypto/dh/dh.c",
   "src/crypto/dh/dh_asn1.c",
   "src/crypto/dh/params.c",
-  "src/crypto/digest/digest.c",
-  "src/crypto/digest/digests.c",
-  "src/crypto/digest/internal.h",
-  "src/crypto/digest/md32_common.h",
+  "src/crypto/digest_extra/digest_extra.c",
   "src/crypto/dsa/dsa.c",
   "src/crypto/dsa/dsa_asn1.c",
-  "src/crypto/ec/ec.c",
-  "src/crypto/ec/ec_asn1.c",
-  "src/crypto/ec/ec_key.c",
-  "src/crypto/ec/ec_montgomery.c",
-  "src/crypto/ec/internal.h",
-  "src/crypto/ec/oct.c",
-  "src/crypto/ec/p224-64.c",
-  "src/crypto/ec/p256-64.c",
-  "src/crypto/ec/p256-x86_64-table.h",
-  "src/crypto/ec/p256-x86_64.c",
-  "src/crypto/ec/p256-x86_64.h",
-  "src/crypto/ec/simple.c",
-  "src/crypto/ec/util-64.c",
-  "src/crypto/ec/wnaf.c",
+  "src/crypto/ec_extra/ec_asn1.c",
   "src/crypto/ecdh/ecdh.c",
-  "src/crypto/ecdsa/ecdsa.c",
-  "src/crypto/ecdsa/ecdsa_asn1.c",
+  "src/crypto/ecdsa_extra/ecdsa_asn1.c",
   "src/crypto/engine/engine.c",
   "src/crypto/err/err.c",
+  "src/crypto/err/internal.h",
   "src/crypto/evp/digestsign.c",
   "src/crypto/evp/evp.c",
   "src/crypto/evp/evp_asn1.c",
@@ -151,30 +103,39 @@
   "src/crypto/evp/p_dsa_asn1.c",
   "src/crypto/evp/p_ec.c",
   "src/crypto/evp/p_ec_asn1.c",
+  "src/crypto/evp/p_ed25519.c",
+  "src/crypto/evp/p_ed25519_asn1.c",
   "src/crypto/evp/p_rsa.c",
   "src/crypto/evp/p_rsa_asn1.c",
   "src/crypto/evp/pbkdf.c",
   "src/crypto/evp/print.c",
+  "src/crypto/evp/scrypt.c",
   "src/crypto/evp/sign.c",
   "src/crypto/ex_data.c",
+  "src/crypto/fipsmodule/aes/internal.h",
+  "src/crypto/fipsmodule/bcm.c",
+  "src/crypto/fipsmodule/bn/internal.h",
+  "src/crypto/fipsmodule/bn/rsaz_exp.h",
+  "src/crypto/fipsmodule/cipher/internal.h",
+  "src/crypto/fipsmodule/delocate.h",
+  "src/crypto/fipsmodule/des/internal.h",
+  "src/crypto/fipsmodule/digest/internal.h",
+  "src/crypto/fipsmodule/digest/md32_common.h",
+  "src/crypto/fipsmodule/ec/internal.h",
+  "src/crypto/fipsmodule/ec/p256-x86_64-table.h",
+  "src/crypto/fipsmodule/ec/p256-x86_64.h",
+  "src/crypto/fipsmodule/is_fips.c",
+  "src/crypto/fipsmodule/modes/internal.h",
+  "src/crypto/fipsmodule/rand/internal.h",
+  "src/crypto/fipsmodule/rsa/internal.h",
+  "src/crypto/fipsmodule/tls/internal.h",
   "src/crypto/hkdf/hkdf.c",
-  "src/crypto/hmac/hmac.c",
   "src/crypto/internal.h",
   "src/crypto/lhash/lhash.c",
-  "src/crypto/md4/md4.c",
-  "src/crypto/md5/md5.c",
   "src/crypto/mem.c",
-  "src/crypto/modes/cbc.c",
-  "src/crypto/modes/cfb.c",
-  "src/crypto/modes/ctr.c",
-  "src/crypto/modes/gcm.c",
-  "src/crypto/modes/internal.h",
-  "src/crypto/modes/ofb.c",
-  "src/crypto/modes/polyval.c",
   "src/crypto/obj/obj.c",
   "src/crypto/obj/obj_dat.h",
   "src/crypto/obj/obj_xref.c",
-  "src/crypto/obj/obj_xref.h",
   "src/crypto/pem/pem_all.c",
   "src/crypto/pem/pem_info.c",
   "src/crypto/pem/pem_lib.c",
@@ -183,41 +144,33 @@
   "src/crypto/pem/pem_pkey.c",
   "src/crypto/pem/pem_x509.c",
   "src/crypto/pem/pem_xaux.c",
+  "src/crypto/pkcs7/internal.h",
+  "src/crypto/pkcs7/pkcs7.c",
+  "src/crypto/pkcs7/pkcs7_x509.c",
   "src/crypto/pkcs8/internal.h",
-  "src/crypto/pkcs8/p5_pbe.c",
   "src/crypto/pkcs8/p5_pbev2.c",
-  "src/crypto/pkcs8/p8_pkey.c",
   "src/crypto/pkcs8/pkcs8.c",
+  "src/crypto/pkcs8/pkcs8_x509.c",
   "src/crypto/poly1305/internal.h",
   "src/crypto/poly1305/poly1305.c",
   "src/crypto/poly1305/poly1305_arm.c",
   "src/crypto/poly1305/poly1305_vec.c",
   "src/crypto/pool/internal.h",
   "src/crypto/pool/pool.c",
-  "src/crypto/rand/deterministic.c",
-  "src/crypto/rand/internal.h",
-  "src/crypto/rand/rand.c",
-  "src/crypto/rand/urandom.c",
-  "src/crypto/rand/windows.c",
+  "src/crypto/rand_extra/deterministic.c",
+  "src/crypto/rand_extra/forkunsafe.c",
+  "src/crypto/rand_extra/fuchsia.c",
+  "src/crypto/rand_extra/rand_extra.c",
+  "src/crypto/rand_extra/windows.c",
   "src/crypto/rc4/rc4.c",
   "src/crypto/refcount_c11.c",
   "src/crypto/refcount_lock.c",
-  "src/crypto/rsa/blinding.c",
-  "src/crypto/rsa/internal.h",
-  "src/crypto/rsa/padding.c",
-  "src/crypto/rsa/rsa.c",
-  "src/crypto/rsa/rsa_asn1.c",
-  "src/crypto/rsa/rsa_impl.c",
-  "src/crypto/sha/sha1-altivec.c",
-  "src/crypto/sha/sha1.c",
-  "src/crypto/sha/sha256.c",
-  "src/crypto/sha/sha512.c",
+  "src/crypto/rsa_extra/rsa_asn1.c",
   "src/crypto/stack/stack.c",
   "src/crypto/thread.c",
   "src/crypto/thread_none.c",
   "src/crypto/thread_pthread.c",
   "src/crypto/thread_win.c",
-  "src/crypto/time_support.c",
   "src/crypto/x509/a_digest.c",
   "src/crypto/x509/a_sign.c",
   "src/crypto/x509/a_strex.c",
@@ -229,7 +182,6 @@
   "src/crypto/x509/charmap.h",
   "src/crypto/x509/i2d_pr.c",
   "src/crypto/x509/internal.h",
-  "src/crypto/x509/pkcs7.c",
   "src/crypto/x509/rsa_pss.c",
   "src/crypto/x509/t_crl.c",
   "src/crypto/x509/t_req.c",
@@ -256,7 +208,6 @@
   "src/crypto/x509/x509name.c",
   "src/crypto/x509/x509rset.c",
   "src/crypto/x509/x509spki.c",
-  "src/crypto/x509/x509type.c",
   "src/crypto/x509/x_algor.c",
   "src/crypto/x509/x_all.c",
   "src/crypto/x509/x_attrib.c",
@@ -342,6 +293,7 @@
   "src/include/openssl/ex_data.h",
   "src/include/openssl/hkdf.h",
   "src/include/openssl/hmac.h",
+  "src/include/openssl/is_boringssl.h",
   "src/include/openssl/lhash.h",
   "src/include/openssl/lhash_macros.h",
   "src/include/openssl/md4.h",
@@ -366,15 +318,17 @@
   "src/include/openssl/rsa.h",
   "src/include/openssl/safestack.h",
   "src/include/openssl/sha.h",
+  "src/include/openssl/span.h",
   "src/include/openssl/srtp.h",
   "src/include/openssl/stack.h",
-  "src/include/openssl/stack_macros.h",
   "src/include/openssl/thread.h",
-  "src/include/openssl/time_support.h",
   "src/include/openssl/type_check.h",
   "src/include/openssl/x509.h",
   "src/include/openssl/x509_vfy.h",
   "src/include/openssl/x509v3.h",
+  "src/third_party/fiat/curve25519.c",
+  "src/third_party/fiat/curve25519_tables.h",
+  "src/third_party/fiat/internal.h",
 ]
 
 ssl_sources = [
@@ -382,181 +336,215 @@
   "src/include/openssl/ssl.h",
   "src/include/openssl/ssl3.h",
   "src/include/openssl/tls1.h",
-  "src/ssl/custom_extensions.c",
-  "src/ssl/d1_both.c",
-  "src/ssl/d1_lib.c",
-  "src/ssl/d1_pkt.c",
-  "src/ssl/d1_srtp.c",
-  "src/ssl/dtls_method.c",
-  "src/ssl/dtls_record.c",
-  "src/ssl/handshake_client.c",
-  "src/ssl/handshake_server.c",
+  "src/ssl/bio_ssl.cc",
+  "src/ssl/custom_extensions.cc",
+  "src/ssl/d1_both.cc",
+  "src/ssl/d1_lib.cc",
+  "src/ssl/d1_pkt.cc",
+  "src/ssl/d1_srtp.cc",
+  "src/ssl/dtls_method.cc",
+  "src/ssl/dtls_record.cc",
+  "src/ssl/handshake.cc",
+  "src/ssl/handshake_client.cc",
+  "src/ssl/handshake_server.cc",
   "src/ssl/internal.h",
-  "src/ssl/s3_both.c",
-  "src/ssl/s3_enc.c",
-  "src/ssl/s3_lib.c",
-  "src/ssl/s3_pkt.c",
-  "src/ssl/ssl_aead_ctx.c",
-  "src/ssl/ssl_asn1.c",
-  "src/ssl/ssl_buffer.c",
-  "src/ssl/ssl_cert.c",
-  "src/ssl/ssl_cipher.c",
-  "src/ssl/ssl_ecdh.c",
-  "src/ssl/ssl_file.c",
-  "src/ssl/ssl_lib.c",
-  "src/ssl/ssl_rsa.c",
-  "src/ssl/ssl_session.c",
-  "src/ssl/ssl_stat.c",
-  "src/ssl/t1_enc.c",
-  "src/ssl/t1_lib.c",
-  "src/ssl/tls13_both.c",
-  "src/ssl/tls13_client.c",
-  "src/ssl/tls13_enc.c",
-  "src/ssl/tls13_server.c",
-  "src/ssl/tls_method.c",
-  "src/ssl/tls_record.c",
+  "src/ssl/s3_both.cc",
+  "src/ssl/s3_lib.cc",
+  "src/ssl/s3_pkt.cc",
+  "src/ssl/ssl_aead_ctx.cc",
+  "src/ssl/ssl_asn1.cc",
+  "src/ssl/ssl_buffer.cc",
+  "src/ssl/ssl_cert.cc",
+  "src/ssl/ssl_cipher.cc",
+  "src/ssl/ssl_file.cc",
+  "src/ssl/ssl_key_share.cc",
+  "src/ssl/ssl_lib.cc",
+  "src/ssl/ssl_privkey.cc",
+  "src/ssl/ssl_session.cc",
+  "src/ssl/ssl_stat.cc",
+  "src/ssl/ssl_transcript.cc",
+  "src/ssl/ssl_versions.cc",
+  "src/ssl/ssl_x509.cc",
+  "src/ssl/t1_enc.cc",
+  "src/ssl/t1_lib.cc",
+  "src/ssl/tls13_both.cc",
+  "src/ssl/tls13_client.cc",
+  "src/ssl/tls13_enc.cc",
+  "src/ssl/tls13_server.cc",
+  "src/ssl/tls_method.cc",
+  "src/ssl/tls_record.cc",
+]
+
+crypto_sources_ios_aarch64 = [
+  "ios-aarch64/crypto/chacha/chacha-armv8.S",
+  "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S",
+  "ios-aarch64/crypto/fipsmodule/armv8-mont.S",
+  "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
+  "ios-aarch64/crypto/fipsmodule/sha1-armv8.S",
+  "ios-aarch64/crypto/fipsmodule/sha256-armv8.S",
+  "ios-aarch64/crypto/fipsmodule/sha512-armv8.S",
+]
+
+crypto_sources_ios_arm = [
+  "ios-arm/crypto/chacha/chacha-armv4.S",
+  "ios-arm/crypto/fipsmodule/aes-armv4.S",
+  "ios-arm/crypto/fipsmodule/aesv8-armx32.S",
+  "ios-arm/crypto/fipsmodule/armv4-mont.S",
+  "ios-arm/crypto/fipsmodule/bsaes-armv7.S",
+  "ios-arm/crypto/fipsmodule/ghash-armv4.S",
+  "ios-arm/crypto/fipsmodule/ghashv8-armx32.S",
+  "ios-arm/crypto/fipsmodule/sha1-armv4-large.S",
+  "ios-arm/crypto/fipsmodule/sha256-armv4.S",
+  "ios-arm/crypto/fipsmodule/sha512-armv4.S",
 ]
 
 crypto_sources_linux_aarch64 = [
-  "linux-aarch64/crypto/aes/aesv8-armx64.S",
-  "linux-aarch64/crypto/bn/armv8-mont.S",
   "linux-aarch64/crypto/chacha/chacha-armv8.S",
-  "linux-aarch64/crypto/modes/ghashv8-armx64.S",
-  "linux-aarch64/crypto/sha/sha1-armv8.S",
-  "linux-aarch64/crypto/sha/sha256-armv8.S",
-  "linux-aarch64/crypto/sha/sha512-armv8.S",
+  "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S",
+  "linux-aarch64/crypto/fipsmodule/armv8-mont.S",
+  "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
+  "linux-aarch64/crypto/fipsmodule/sha1-armv8.S",
+  "linux-aarch64/crypto/fipsmodule/sha256-armv8.S",
+  "linux-aarch64/crypto/fipsmodule/sha512-armv8.S",
 ]
 
 crypto_sources_linux_arm = [
-  "linux-arm/crypto/aes/aes-armv4.S",
-  "linux-arm/crypto/aes/aesv8-armx32.S",
-  "linux-arm/crypto/aes/bsaes-armv7.S",
-  "linux-arm/crypto/bn/armv4-mont.S",
   "linux-arm/crypto/chacha/chacha-armv4.S",
-  "linux-arm/crypto/modes/ghash-armv4.S",
-  "linux-arm/crypto/modes/ghashv8-armx32.S",
-  "linux-arm/crypto/sha/sha1-armv4-large.S",
-  "linux-arm/crypto/sha/sha256-armv4.S",
-  "linux-arm/crypto/sha/sha512-armv4.S",
+  "linux-arm/crypto/fipsmodule/aes-armv4.S",
+  "linux-arm/crypto/fipsmodule/aesv8-armx32.S",
+  "linux-arm/crypto/fipsmodule/armv4-mont.S",
+  "linux-arm/crypto/fipsmodule/bsaes-armv7.S",
+  "linux-arm/crypto/fipsmodule/ghash-armv4.S",
+  "linux-arm/crypto/fipsmodule/ghashv8-armx32.S",
+  "linux-arm/crypto/fipsmodule/sha1-armv4-large.S",
+  "linux-arm/crypto/fipsmodule/sha256-armv4.S",
+  "linux-arm/crypto/fipsmodule/sha512-armv4.S",
   "src/crypto/curve25519/asm/x25519-asm-arm.S",
   "src/crypto/poly1305/poly1305_arm_asm.S",
 ]
 
 crypto_sources_linux_ppc64le = [
-  "linux-ppc64le/crypto/aes/aesp8-ppc.S",
-  "linux-ppc64le/crypto/modes/ghashp8-ppc.S",
+  "linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S",
+  "linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S",
 ]
 
 crypto_sources_linux_x86 = [
-  "linux-x86/crypto/aes/aes-586.S",
-  "linux-x86/crypto/aes/aesni-x86.S",
-  "linux-x86/crypto/aes/vpaes-x86.S",
-  "linux-x86/crypto/bn/bn-586.S",
-  "linux-x86/crypto/bn/co-586.S",
-  "linux-x86/crypto/bn/x86-mont.S",
   "linux-x86/crypto/chacha/chacha-x86.S",
-  "linux-x86/crypto/md5/md5-586.S",
-  "linux-x86/crypto/modes/ghash-x86.S",
-  "linux-x86/crypto/sha/sha1-586.S",
-  "linux-x86/crypto/sha/sha256-586.S",
-  "linux-x86/crypto/sha/sha512-586.S",
+  "linux-x86/crypto/fipsmodule/aes-586.S",
+  "linux-x86/crypto/fipsmodule/aesni-x86.S",
+  "linux-x86/crypto/fipsmodule/bn-586.S",
+  "linux-x86/crypto/fipsmodule/co-586.S",
+  "linux-x86/crypto/fipsmodule/ghash-x86.S",
+  "linux-x86/crypto/fipsmodule/md5-586.S",
+  "linux-x86/crypto/fipsmodule/sha1-586.S",
+  "linux-x86/crypto/fipsmodule/sha256-586.S",
+  "linux-x86/crypto/fipsmodule/sha512-586.S",
+  "linux-x86/crypto/fipsmodule/vpaes-x86.S",
+  "linux-x86/crypto/fipsmodule/x86-mont.S",
 ]
 
 crypto_sources_linux_x86_64 = [
-  "linux-x86_64/crypto/aes/aes-x86_64.S",
-  "linux-x86_64/crypto/aes/aesni-x86_64.S",
-  "linux-x86_64/crypto/aes/bsaes-x86_64.S",
-  "linux-x86_64/crypto/aes/vpaes-x86_64.S",
-  "linux-x86_64/crypto/bn/rsaz-avx2.S",
-  "linux-x86_64/crypto/bn/rsaz-x86_64.S",
-  "linux-x86_64/crypto/bn/x86_64-mont.S",
-  "linux-x86_64/crypto/bn/x86_64-mont5.S",
   "linux-x86_64/crypto/chacha/chacha-x86_64.S",
-  "linux-x86_64/crypto/ec/p256-x86_64-asm.S",
-  "linux-x86_64/crypto/md5/md5-x86_64.S",
-  "linux-x86_64/crypto/modes/aesni-gcm-x86_64.S",
-  "linux-x86_64/crypto/modes/ghash-x86_64.S",
-  "linux-x86_64/crypto/rand/rdrand-x86_64.S",
-  "linux-x86_64/crypto/sha/sha1-x86_64.S",
-  "linux-x86_64/crypto/sha/sha256-x86_64.S",
-  "linux-x86_64/crypto/sha/sha512-x86_64.S",
+  "linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
+  "linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/aes-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/aesni-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/ghash-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/md5-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
+  "linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/rsaz-avx2.S",
+  "linux-x86_64/crypto/fipsmodule/sha1-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/sha256-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/sha512-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
+  "linux-x86_64/crypto/fipsmodule/x86_64-mont.S",
+  "linux-x86_64/crypto/fipsmodule/x86_64-mont5.S",
   "src/crypto/curve25519/asm/x25519-asm-x86_64.S",
 ]
 
 crypto_sources_mac_x86 = [
-  "mac-x86/crypto/aes/aes-586.S",
-  "mac-x86/crypto/aes/aesni-x86.S",
-  "mac-x86/crypto/aes/vpaes-x86.S",
-  "mac-x86/crypto/bn/bn-586.S",
-  "mac-x86/crypto/bn/co-586.S",
-  "mac-x86/crypto/bn/x86-mont.S",
   "mac-x86/crypto/chacha/chacha-x86.S",
-  "mac-x86/crypto/md5/md5-586.S",
-  "mac-x86/crypto/modes/ghash-x86.S",
-  "mac-x86/crypto/sha/sha1-586.S",
-  "mac-x86/crypto/sha/sha256-586.S",
-  "mac-x86/crypto/sha/sha512-586.S",
+  "mac-x86/crypto/fipsmodule/aes-586.S",
+  "mac-x86/crypto/fipsmodule/aesni-x86.S",
+  "mac-x86/crypto/fipsmodule/bn-586.S",
+  "mac-x86/crypto/fipsmodule/co-586.S",
+  "mac-x86/crypto/fipsmodule/ghash-x86.S",
+  "mac-x86/crypto/fipsmodule/md5-586.S",
+  "mac-x86/crypto/fipsmodule/sha1-586.S",
+  "mac-x86/crypto/fipsmodule/sha256-586.S",
+  "mac-x86/crypto/fipsmodule/sha512-586.S",
+  "mac-x86/crypto/fipsmodule/vpaes-x86.S",
+  "mac-x86/crypto/fipsmodule/x86-mont.S",
 ]
 
 crypto_sources_mac_x86_64 = [
-  "mac-x86_64/crypto/aes/aes-x86_64.S",
-  "mac-x86_64/crypto/aes/aesni-x86_64.S",
-  "mac-x86_64/crypto/aes/bsaes-x86_64.S",
-  "mac-x86_64/crypto/aes/vpaes-x86_64.S",
-  "mac-x86_64/crypto/bn/rsaz-avx2.S",
-  "mac-x86_64/crypto/bn/rsaz-x86_64.S",
-  "mac-x86_64/crypto/bn/x86_64-mont.S",
-  "mac-x86_64/crypto/bn/x86_64-mont5.S",
   "mac-x86_64/crypto/chacha/chacha-x86_64.S",
-  "mac-x86_64/crypto/ec/p256-x86_64-asm.S",
-  "mac-x86_64/crypto/md5/md5-x86_64.S",
-  "mac-x86_64/crypto/modes/aesni-gcm-x86_64.S",
-  "mac-x86_64/crypto/modes/ghash-x86_64.S",
-  "mac-x86_64/crypto/rand/rdrand-x86_64.S",
-  "mac-x86_64/crypto/sha/sha1-x86_64.S",
-  "mac-x86_64/crypto/sha/sha256-x86_64.S",
-  "mac-x86_64/crypto/sha/sha512-x86_64.S",
+  "mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
+  "mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/aes-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/md5-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
+  "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S",
+  "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/sha256-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/sha512-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
+  "mac-x86_64/crypto/fipsmodule/x86_64-mont.S",
+  "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S",
   "src/crypto/curve25519/asm/x25519-asm-x86_64.S",
 ]
 
 crypto_sources_win_x86 = [
-  "win-x86/crypto/aes/aes-586.asm",
-  "win-x86/crypto/aes/aesni-x86.asm",
-  "win-x86/crypto/aes/vpaes-x86.asm",
-  "win-x86/crypto/bn/bn-586.asm",
-  "win-x86/crypto/bn/co-586.asm",
-  "win-x86/crypto/bn/x86-mont.asm",
   "win-x86/crypto/chacha/chacha-x86.asm",
-  "win-x86/crypto/md5/md5-586.asm",
-  "win-x86/crypto/modes/ghash-x86.asm",
-  "win-x86/crypto/sha/sha1-586.asm",
-  "win-x86/crypto/sha/sha256-586.asm",
-  "win-x86/crypto/sha/sha512-586.asm",
+  "win-x86/crypto/fipsmodule/aes-586.asm",
+  "win-x86/crypto/fipsmodule/aesni-x86.asm",
+  "win-x86/crypto/fipsmodule/bn-586.asm",
+  "win-x86/crypto/fipsmodule/co-586.asm",
+  "win-x86/crypto/fipsmodule/ghash-x86.asm",
+  "win-x86/crypto/fipsmodule/md5-586.asm",
+  "win-x86/crypto/fipsmodule/sha1-586.asm",
+  "win-x86/crypto/fipsmodule/sha256-586.asm",
+  "win-x86/crypto/fipsmodule/sha512-586.asm",
+  "win-x86/crypto/fipsmodule/vpaes-x86.asm",
+  "win-x86/crypto/fipsmodule/x86-mont.asm",
 ]
 
 crypto_sources_win_x86_64 = [
-  "win-x86_64/crypto/aes/aes-x86_64.asm",
-  "win-x86_64/crypto/aes/aesni-x86_64.asm",
-  "win-x86_64/crypto/aes/bsaes-x86_64.asm",
-  "win-x86_64/crypto/aes/vpaes-x86_64.asm",
-  "win-x86_64/crypto/bn/rsaz-avx2.asm",
-  "win-x86_64/crypto/bn/rsaz-x86_64.asm",
-  "win-x86_64/crypto/bn/x86_64-mont.asm",
-  "win-x86_64/crypto/bn/x86_64-mont5.asm",
   "win-x86_64/crypto/chacha/chacha-x86_64.asm",
-  "win-x86_64/crypto/ec/p256-x86_64-asm.asm",
-  "win-x86_64/crypto/md5/md5-x86_64.asm",
-  "win-x86_64/crypto/modes/aesni-gcm-x86_64.asm",
-  "win-x86_64/crypto/modes/ghash-x86_64.asm",
-  "win-x86_64/crypto/rand/rdrand-x86_64.asm",
-  "win-x86_64/crypto/sha/sha1-x86_64.asm",
-  "win-x86_64/crypto/sha/sha256-x86_64.asm",
-  "win-x86_64/crypto/sha/sha512-x86_64.asm",
+  "win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm",
+  "win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/aes-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/aesni-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/ghash-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/md5-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm",
+  "win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/rsaz-avx2.asm",
+  "win-x86_64/crypto/fipsmodule/sha1-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/sha256-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/sha512-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm",
+  "win-x86_64/crypto/fipsmodule/x86_64-mont.asm",
+  "win-x86_64/crypto/fipsmodule/x86_64-mont5.asm",
 ]
 
 fuzzers = [
+  "bn_div",
+  "bn_mod_exp",
   "cert",
   "client",
+  "dtls_client",
+  "dtls_server",
   "pkcs8",
   "privkey",
   "read_pem",
diff --git a/BUILD.generated_tests.gni b/BUILD.generated_tests.gni
index 2889b32..a79959d 100644
--- a/BUILD.generated_tests.gni
+++ b/BUILD.generated_tests.gni
@@ -4,605 +4,74 @@
 
 # This file is created by generate_build_files.py. Do not edit manually.
 
-_test_support_sources = [
+test_support_sources = [
   "src/crypto/test/file_test.cc",
   "src/crypto/test/file_test.h",
+  "src/crypto/test/gtest_main.h",
   "src/crypto/test/malloc.cc",
   "src/crypto/test/test_util.cc",
   "src/crypto/test/test_util.h",
   "src/ssl/test/async_bio.h",
+  "src/ssl/test/fuzzer.h",
+  "src/ssl/test/fuzzer_tags.h",
   "src/ssl/test/packeted_bio.h",
   "src/ssl/test/test_config.h",
 ]
 
-template("create_tests") {
-  executable("boringssl_aes_test") {
-    sources = [
-      "src/crypto/aes/aes_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
+crypto_test_sources = [
+  "crypto_test_data.cc",
+  "src/crypto/asn1/asn1_test.cc",
+  "src/crypto/base64/base64_test.cc",
+  "src/crypto/bio/bio_test.cc",
+  "src/crypto/buf/buf_test.cc",
+  "src/crypto/bytestring/bytestring_test.cc",
+  "src/crypto/chacha/chacha_test.cc",
+  "src/crypto/cipher_extra/aead_test.cc",
+  "src/crypto/cipher_extra/cipher_test.cc",
+  "src/crypto/cmac/cmac_test.cc",
+  "src/crypto/compiler_test.cc",
+  "src/crypto/constant_time_test.cc",
+  "src/crypto/curve25519/ed25519_test.cc",
+  "src/crypto/curve25519/spake25519_test.cc",
+  "src/crypto/curve25519/x25519_test.cc",
+  "src/crypto/dh/dh_test.cc",
+  "src/crypto/digest_extra/digest_test.cc",
+  "src/crypto/dsa/dsa_test.cc",
+  "src/crypto/ecdh/ecdh_test.cc",
+  "src/crypto/err/err_test.cc",
+  "src/crypto/evp/evp_extra_test.cc",
+  "src/crypto/evp/evp_test.cc",
+  "src/crypto/evp/pbkdf_test.cc",
+  "src/crypto/evp/scrypt_test.cc",
+  "src/crypto/fipsmodule/aes/aes_test.cc",
+  "src/crypto/fipsmodule/bn/bn_test.cc",
+  "src/crypto/fipsmodule/ec/ec_test.cc",
+  "src/crypto/fipsmodule/ec/p256-x86_64_test.cc",
+  "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc",
+  "src/crypto/fipsmodule/modes/gcm_test.cc",
+  "src/crypto/fipsmodule/rand/ctrdrbg_test.cc",
+  "src/crypto/hkdf/hkdf_test.cc",
+  "src/crypto/hmac_extra/hmac_test.cc",
+  "src/crypto/lhash/lhash_test.cc",
+  "src/crypto/obj/obj_test.cc",
+  "src/crypto/pkcs7/pkcs7_test.cc",
+  "src/crypto/pkcs8/pkcs12_test.cc",
+  "src/crypto/pkcs8/pkcs8_test.cc",
+  "src/crypto/poly1305/poly1305_test.cc",
+  "src/crypto/pool/pool_test.cc",
+  "src/crypto/refcount_test.cc",
+  "src/crypto/rsa_extra/rsa_test.cc",
+  "src/crypto/self_test.cc",
+  "src/crypto/test/file_test_gtest.cc",
+  "src/crypto/test/gtest_main.cc",
+  "src/crypto/thread_test.cc",
+  "src/crypto/x509/x509_test.cc",
+  "src/crypto/x509v3/tab_test.cc",
+  "src/crypto/x509v3/v3name_test.cc",
+]
 
-  executable("boringssl_asn1_test") {
-    sources = [
-      "src/crypto/asn1/asn1_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_base64_test") {
-    sources = [
-      "src/crypto/base64/base64_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_bio_test") {
-    sources = [
-      "src/crypto/bio/bio_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_bn_test") {
-    sources = [
-      "src/crypto/bn/bn_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_bytestring_test") {
-    sources = [
-      "src/crypto/bytestring/bytestring_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_chacha_test") {
-    sources = [
-      "src/crypto/chacha/chacha_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_aead_test") {
-    sources = [
-      "src/crypto/cipher/aead_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_cipher_test") {
-    sources = [
-      "src/crypto/cipher/cipher_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_cmac_test") {
-    sources = [
-      "src/crypto/cmac/cmac_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_constant_time_test") {
-    sources = [
-      "src/crypto/constant_time_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ed25519_test") {
-    sources = [
-      "src/crypto/curve25519/ed25519_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_spake25519_test") {
-    sources = [
-      "src/crypto/curve25519/spake25519_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_x25519_test") {
-    sources = [
-      "src/crypto/curve25519/x25519_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_dh_test") {
-    sources = [
-      "src/crypto/dh/dh_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_digest_test") {
-    sources = [
-      "src/crypto/digest/digest_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_dsa_test") {
-    sources = [
-      "src/crypto/dsa/dsa_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ec_test") {
-    sources = [
-      "src/crypto/ec/ec_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_example_mul") {
-    sources = [
-      "src/crypto/ec/example_mul.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_p256-x86_64_test") {
-    sources = [
-      "src/crypto/ec/p256-x86_64_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ecdh_test") {
-    sources = [
-      "src/crypto/ecdh/ecdh_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ecdsa_sign_test") {
-    sources = [
-      "src/crypto/ecdsa/ecdsa_sign_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ecdsa_test") {
-    sources = [
-      "src/crypto/ecdsa/ecdsa_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ecdsa_verify_test") {
-    sources = [
-      "src/crypto/ecdsa/ecdsa_verify_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_err_test") {
-    sources = [
-      "src/crypto/err/err_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_evp_extra_test") {
-    sources = [
-      "src/crypto/evp/evp_extra_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_evp_test") {
-    sources = [
-      "src/crypto/evp/evp_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_pbkdf_test") {
-    sources = [
-      "src/crypto/evp/pbkdf_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_hkdf_test") {
-    sources = [
-      "src/crypto/hkdf/hkdf_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_hmac_test") {
-    sources = [
-      "src/crypto/hmac/hmac_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_lhash_test") {
-    sources = [
-      "src/crypto/lhash/lhash_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_gcm_test") {
-    sources = [
-      "src/crypto/modes/gcm_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_obj_test") {
-    sources = [
-      "src/crypto/obj/obj_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_pkcs12_test") {
-    sources = [
-      "src/crypto/pkcs8/pkcs12_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_pkcs8_test") {
-    sources = [
-      "src/crypto/pkcs8/pkcs8_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_poly1305_test") {
-    sources = [
-      "src/crypto/poly1305/poly1305_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_pool_test") {
-    sources = [
-      "src/crypto/pool/pool_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_refcount_test") {
-    sources = [
-      "src/crypto/refcount_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_rsa_test") {
-    sources = [
-      "src/crypto/rsa/rsa_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_thread_test") {
-    sources = [
-      "src/crypto/thread_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_pkcs7_test") {
-    sources = [
-      "src/crypto/x509/pkcs7_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_x509_test") {
-    sources = [
-      "src/crypto/x509/x509_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_tab_test") {
-    sources = [
-      "src/crypto/x509v3/tab_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_v3name_test") {
-    sources = [
-      "src/crypto/x509v3/v3name_test.c",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  executable("boringssl_ssl_test") {
-    sources = [
-      "src/ssl/ssl_test.cc",
-    ]
-    sources += _test_support_sources
-    if (defined(invoker.configs_exclude)) {
-      configs -= invoker.configs_exclude
-    }
-    configs += invoker.configs
-    deps = invoker.deps
-  }
-
-  group(target_name) {
-    deps = [
-      ":boringssl_aead_test",
-      ":boringssl_aes_test",
-      ":boringssl_asn1_test",
-      ":boringssl_base64_test",
-      ":boringssl_bio_test",
-      ":boringssl_bn_test",
-      ":boringssl_bytestring_test",
-      ":boringssl_chacha_test",
-      ":boringssl_cipher_test",
-      ":boringssl_cmac_test",
-      ":boringssl_constant_time_test",
-      ":boringssl_dh_test",
-      ":boringssl_digest_test",
-      ":boringssl_dsa_test",
-      ":boringssl_ec_test",
-      ":boringssl_ecdh_test",
-      ":boringssl_ecdsa_sign_test",
-      ":boringssl_ecdsa_test",
-      ":boringssl_ecdsa_verify_test",
-      ":boringssl_ed25519_test",
-      ":boringssl_err_test",
-      ":boringssl_evp_extra_test",
-      ":boringssl_evp_test",
-      ":boringssl_example_mul",
-      ":boringssl_gcm_test",
-      ":boringssl_hkdf_test",
-      ":boringssl_hmac_test",
-      ":boringssl_lhash_test",
-      ":boringssl_obj_test",
-      ":boringssl_p256-x86_64_test",
-      ":boringssl_pbkdf_test",
-      ":boringssl_pkcs12_test",
-      ":boringssl_pkcs7_test",
-      ":boringssl_pkcs8_test",
-      ":boringssl_poly1305_test",
-      ":boringssl_pool_test",
-      ":boringssl_refcount_test",
-      ":boringssl_rsa_test",
-      ":boringssl_spake25519_test",
-      ":boringssl_ssl_test",
-      ":boringssl_tab_test",
-      ":boringssl_thread_test",
-      ":boringssl_v3name_test",
-      ":boringssl_x25519_test",
-      ":boringssl_x509_test",
-    ]
-  }
-}
+ssl_test_sources = [
+  "src/crypto/test/gtest_main.cc",
+  "src/ssl/span_test.cc",
+  "src/ssl/ssl_test.cc",
+]
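
With the create_tests template gone, a consuming BUILD.gn is expected to
define the gtest-based test binaries itself from the exported lists. A
minimal sketch of what that might look like; the import path and the
":boringssl" and "//third_party/gtest" target names are assumptions, not
part of the generated file:

  import("BUILD.generated_tests.gni")  # path relative to the consumer; assumed

  # Sketch only: assemble the crypto test binary from the generated lists.
  # crypto_test_sources already includes src/crypto/test/gtest_main.cc,
  # which supplies main().
  executable("boringssl_crypto_test") {
    sources = crypto_test_sources + test_support_sources
    configs += [ ":internal_config" ]  # assumed: reuse the library's flags
    deps = [
      ":boringssl",
      "//third_party/gtest",
    ]
  }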
diff --git a/BUILD.gn b/BUILD.gn
index a9b4a50..cbc97e8 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -18,12 +18,14 @@
 config("internal_config") {
   visibility = [ ":*" ]  # Only targets in this file can depend on this.
   defines = [
+    "BORINGSSL_ALLOW_CXX_RUNTIME",
     "BORINGSSL_IMPLEMENTATION",
     "BORINGSSL_NO_STATIC_INITIALIZER",
     "OPENSSL_SMALL",
   ]
   if (is_posix) {
-    cflags_c = [ "-std=c99" ]
+    cflags_c = [ "-std=c11" ]
+    cflags_cc = [ "-std=c++14" ]
     defines += [ "_XOPEN_SOURCE=700" ]
   }
 }
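
Since ssl_sources are now C++ files, whatever target compiles them must
pick up the new cflags_cc from internal_config. A sketch of the
corresponding target in this repository's BUILD.gn; the ":ssl" and
":crypto" names are assumed, not taken from the diff:

  # Sketch only: ssl_sources are .cc files and need the C++14 flag above.
  source_set("ssl") {
    sources = ssl_sources
    configs += [ ":internal_config" ]
    deps = [ ":crypto" ]
  }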
diff --git a/README b/README
index b1c5469..10a7e7a 100644
--- a/README
+++ b/README
@@ -8,9 +8,11 @@
 
 at revision:
 
-d519bf6be0b447fb80fbc539d4bff4479b5482a2
+a62dbf88d8a3c04446db833a1eb80a620cb1514d
 
 To roll boringssl forward, delete all but this file and the above mentioned
 files, checkout the new boringssl into a subdirectory called src/, and run the
 script src/util/generate_build_files.py with appropriate arguments. It may also
 be necessary to update BUILD.gn, etc.
+
+Then submit a pull request on GitHub.
diff --git a/codereview.settings b/codereview.settings
deleted file mode 100644
index ccbccd5..0000000
--- a/codereview.settings
+++ /dev/null
@@ -1,4 +0,0 @@
-# This file is used by gcl to get repository specific information.
-CODE_REVIEW_SERVER: http://codereview.chromium.org
-VIEW_VC: https://github.com/dart-lang/boringssl_gen/commit/
-CC_LIST: reviews@dartlang.org
diff --git a/err_data.c b/err_data.c
index e75a0ca..dbb34a3 100644
--- a/err_data.c
+++ b/err_data.c
@@ -62,158 +62,166 @@
     0xc348899,
     0xc3508a5,
     0xc3588c2,
-    0xc3608d4,
-    0xc3688e2,
-    0xc3708f2,
-    0xc3788ff,
-    0xc38090f,
-    0xc38891a,
-    0xc390930,
-    0xc39893f,
-    0xc3a0953,
+    0xc3608e2,
+    0xc3688f0,
+    0xc370900,
+    0xc37890d,
+    0xc38091d,
+    0xc388928,
+    0xc39093e,
+    0xc39894d,
+    0xc3a0961,
     0xc3a8845,
     0xc3b00ea,
+    0xc3b88d4,
     0x10320845,
-    0x103293ab,
-    0x103313b7,
-    0x103393d0,
-    0x103413e3,
-    0x10348e8b,
-    0x10350c19,
-    0x103593f6,
-    0x1036140b,
-    0x1036941e,
-    0x1037143d,
-    0x10379456,
-    0x1038146b,
-    0x10389489,
-    0x10391498,
-    0x103994b4,
-    0x103a14cf,
-    0x103a94de,
-    0x103b14fa,
-    0x103b9515,
-    0x103c152c,
+    0x10329535,
+    0x10331541,
+    0x1033955a,
+    0x1034156d,
+    0x10348efc,
+    0x10350c5e,
+    0x10359580,
+    0x10361595,
+    0x103695a8,
+    0x103715c7,
+    0x103795e0,
+    0x103815f5,
+    0x10389613,
+    0x10391622,
+    0x1039963e,
+    0x103a1659,
+    0x103a9668,
+    0x103b1684,
+    0x103b969f,
+    0x103c16b6,
     0x103c80ea,
-    0x103d153d,
-    0x103d9551,
-    0x103e1570,
-    0x103e957f,
-    0x103f1596,
-    0x103f95a9,
-    0x10400bea,
-    0x104095bc,
-    0x104115da,
-    0x104195ed,
-    0x10421607,
-    0x10429617,
-    0x1043162b,
-    0x10439641,
-    0x10441659,
-    0x1044966e,
-    0x10451682,
-    0x10459694,
+    0x103d16c7,
+    0x103d96db,
+    0x103e16fa,
+    0x103e9709,
+    0x103f1720,
+    0x103f9733,
+    0x10400c22,
+    0x10409746,
+    0x10411764,
+    0x10419777,
+    0x10421791,
+    0x104297a1,
+    0x104317b5,
+    0x104397cb,
+    0x104417e3,
+    0x104497f8,
+    0x1045180c,
+    0x1045981e,
     0x104605fb,
-    0x1046893f,
-    0x104716a9,
-    0x104796c0,
-    0x104816d5,
-    0x104896e3,
-    0x14320bcd,
-    0x14328bdb,
-    0x14330bea,
-    0x14338bfc,
+    0x1046894d,
+    0x10471833,
+    0x1047984a,
+    0x1048185f,
+    0x1048986d,
+    0x10490e5e,
+    0x14320c05,
+    0x14328c13,
+    0x14330c22,
+    0x14338c34,
     0x143400ac,
     0x143480ea,
     0x18320083,
-    0x18328ee1,
+    0x18328f52,
     0x183300ac,
-    0x18338ef7,
-    0x18340f0b,
+    0x18338f68,
+    0x18340f7c,
     0x183480ea,
-    0x18350f20,
-    0x18358f38,
-    0x18360f4d,
-    0x18368f61,
-    0x18370f85,
-    0x18378f9b,
-    0x18380faf,
-    0x18388fbf,
-    0x18390a57,
-    0x18398fcf,
-    0x183a0fe4,
-    0x183a8ff8,
-    0x183b0c25,
-    0x183b9005,
-    0x183c1017,
-    0x183c9022,
-    0x183d1032,
-    0x183d9043,
-    0x183e1054,
-    0x183e9066,
-    0x183f108f,
-    0x183f90a8,
-    0x184010c0,
+    0x18350f91,
+    0x18358fa9,
+    0x18360fbe,
+    0x18368fd2,
+    0x18370ff6,
+    0x1837900c,
+    0x18381020,
+    0x18389030,
+    0x18390a73,
+    0x18399040,
+    0x183a1068,
+    0x183a908e,
+    0x183b0c6a,
+    0x183b90c3,
+    0x183c10d5,
+    0x183c90e0,
+    0x183d10f0,
+    0x183d9101,
+    0x183e1112,
+    0x183e9124,
+    0x183f114d,
+    0x183f9166,
+    0x1840117e,
     0x184086d3,
-    0x203210e7,
-    0x243210f3,
-    0x24328985,
-    0x24331105,
-    0x24339112,
-    0x2434111f,
-    0x24349131,
-    0x24351140,
-    0x2435915d,
-    0x2436116a,
-    0x24369178,
-    0x24371186,
-    0x24379194,
-    0x2438119d,
-    0x243891aa,
-    0x243911bd,
-    0x28320c0d,
-    0x28328c25,
-    0x28330bea,
-    0x28338c38,
-    0x28340c19,
+    0x184110b1,
+    0x1841907c,
+    0x1842109b,
+    0x18429055,
+    0x203211b8,
+    0x203291a5,
+    0x243211c4,
+    0x24328993,
+    0x243311d6,
+    0x243391e3,
+    0x243411f0,
+    0x24349202,
+    0x24351211,
+    0x2435922e,
+    0x2436123b,
+    0x24369249,
+    0x24371257,
+    0x24379265,
+    0x2438126e,
+    0x2438927b,
+    0x2439128e,
+    0x28320c52,
+    0x28328c6a,
+    0x28330c22,
+    0x28338c7d,
+    0x28340c5e,
     0x283480ac,
     0x283500ea,
-    0x2c32299a,
-    0x2c32a9a8,
-    0x2c3329ba,
-    0x2c33a9cc,
-    0x2c3429e0,
-    0x2c34a9f2,
-    0x2c352a0d,
-    0x2c35aa1f,
-    0x2c362a32,
+    0x2c322cda,
+    0x2c3292a5,
+    0x2c332ce8,
+    0x2c33acfa,
+    0x2c342d0e,
+    0x2c34ad20,
+    0x2c352d3b,
+    0x2c35ad4d,
+    0x2c362d60,
     0x2c36832d,
-    0x2c372a3f,
-    0x2c37aa51,
-    0x2c382a64,
-    0x2c38aa7b,
-    0x2c392a89,
-    0x2c39aa99,
-    0x2c3a2aab,
-    0x2c3aaabf,
-    0x2c3b2ad0,
-    0x2c3baaef,
-    0x2c3c2b03,
-    0x2c3cab19,
-    0x2c3d2b32,
-    0x2c3dab4f,
-    0x2c3e2b60,
-    0x2c3eab6e,
-    0x2c3f2b86,
-    0x2c3fab9e,
-    0x2c402bab,
-    0x2c4090e7,
-    0x2c412bbc,
-    0x2c41abcf,
-    0x2c4210c0,
-    0x2c42abe0,
+    0x2c372d6d,
+    0x2c37ad7f,
+    0x2c382da4,
+    0x2c38adbb,
+    0x2c392dc9,
+    0x2c39add9,
+    0x2c3a2deb,
+    0x2c3aadff,
+    0x2c3b2e10,
+    0x2c3bae2f,
+    0x2c3c12b7,
+    0x2c3c92cd,
+    0x2c3d2e43,
+    0x2c3d92e6,
+    0x2c3e2e60,
+    0x2c3eae6e,
+    0x2c3f2e86,
+    0x2c3fae9e,
+    0x2c402eab,
+    0x2c4091b8,
+    0x2c412ebc,
+    0x2c41aecf,
+    0x2c42117e,
+    0x2c42aee0,
     0x2c430720,
-    0x2c43aae1,
+    0x2c43ae21,
+    0x2c442d92,
     0x30320000,
     0x30328015,
     0x3033001f,
@@ -306,248 +314,266 @@
     0x305e8700,
     0x305f0716,
     0x305f8720,
-    0x34320b47,
-    0x34328b5b,
-    0x34330b78,
-    0x34338b8b,
-    0x34340b9a,
-    0x34348bb7,
+    0x34320b63,
+    0x34328b77,
+    0x34330b94,
+    0x34338ba7,
+    0x34340bb6,
+    0x34348bef,
+    0x34350bd3,
     0x3c320083,
-    0x3c328c62,
-    0x3c330c7b,
-    0x3c338c96,
-    0x3c340cb3,
-    0x3c348cdd,
-    0x3c350cf8,
-    0x3c358d1e,
-    0x3c360d37,
-    0x3c368d4f,
-    0x3c370d60,
-    0x3c378d6e,
-    0x3c380d7b,
-    0x3c388d8f,
-    0x3c390c25,
-    0x3c398da3,
-    0x3c3a0db7,
-    0x3c3a88ff,
-    0x3c3b0dc7,
-    0x3c3b8de2,
-    0x3c3c0df4,
-    0x3c3c8e0a,
-    0x3c3d0e14,
-    0x3c3d8e28,
-    0x3c3e0e36,
-    0x3c3e8e5b,
-    0x3c3f0c4e,
-    0x3c3f8e44,
+    0x3c328ca7,
+    0x3c330cc0,
+    0x3c338cdb,
+    0x3c340cf8,
+    0x3c348d22,
+    0x3c350d3d,
+    0x3c358d63,
+    0x3c360d7c,
+    0x3c368d94,
+    0x3c370da5,
+    0x3c378db3,
+    0x3c380dc0,
+    0x3c388dd4,
+    0x3c390c6a,
+    0x3c398df7,
+    0x3c3a0e0b,
+    0x3c3a890d,
+    0x3c3b0e1b,
+    0x3c3b8e36,
+    0x3c3c0e48,
+    0x3c3c8e7b,
+    0x3c3d0e85,
+    0x3c3d8e99,
+    0x3c3e0ea7,
+    0x3c3e8ecc,
+    0x3c3f0c93,
+    0x3c3f8eb5,
     0x3c4000ac,
     0x3c4080ea,
-    0x3c410cce,
-    0x3c418d0d,
-    0x403216fa,
-    0x40329710,
-    0x4033173e,
-    0x40339748,
-    0x4034175f,
-    0x4034977d,
-    0x4035178d,
-    0x4035979f,
-    0x403617ac,
-    0x403697b8,
-    0x403717cd,
-    0x403797df,
-    0x403817ea,
-    0x403897fc,
-    0x40390e8b,
-    0x4039980c,
-    0x403a181f,
-    0x403a9840,
-    0x403b1851,
-    0x403b9861,
+    0x3c410d13,
+    0x3c418d52,
+    0x3c420e5e,
+    0x3c428de8,
+    0x403218c6,
+    0x403298dc,
+    0x4033190a,
+    0x40339914,
+    0x4034192b,
+    0x40349949,
+    0x40351959,
+    0x4035996b,
+    0x40361978,
+    0x40369984,
+    0x40371999,
+    0x403799ab,
+    0x403819b6,
+    0x403899c8,
+    0x40390efc,
+    0x403999d8,
+    0x403a19eb,
+    0x403a9a0c,
+    0x403b1a1d,
+    0x403b9a2d,
     0x403c0064,
     0x403c8083,
-    0x403d18aa,
-    0x403d98c0,
-    0x403e18cf,
-    0x403e98e2,
-    0x403f18fc,
-    0x403f990a,
-    0x4040191f,
-    0x40409933,
-    0x40411950,
-    0x4041996b,
-    0x40421984,
-    0x40429997,
-    0x404319ab,
-    0x404399c3,
-    0x404419da,
+    0x403d1ab1,
+    0x403d9ac7,
+    0x403e1ad6,
+    0x403e9b0e,
+    0x403f1b28,
+    0x403f9b36,
+    0x40401b4b,
+    0x40409b5f,
+    0x40411b7c,
+    0x40419b97,
+    0x40421bb0,
+    0x40429bc3,
+    0x40431bd7,
+    0x40439bef,
+    0x40441c06,
     0x404480ac,
-    0x404519ef,
-    0x40459a01,
-    0x40461a25,
-    0x40469a45,
-    0x40471a53,
-    0x40479a7a,
-    0x40481ab7,
-    0x40489ad0,
-    0x40491ae7,
-    0x40499b01,
-    0x404a1b18,
-    0x404a9b36,
-    0x404b1b4e,
-    0x404b9b65,
-    0x404c1b7b,
-    0x404c9b8d,
-    0x404d1bae,
-    0x404d9bd0,
-    0x404e1be4,
-    0x404e9bf1,
-    0x404f1c1e,
-    0x404f9c47,
-    0x40501c82,
-    0x40509c96,
-    0x40511cb1,
-    0x40519cc1,
-    0x40521cd8,
-    0x40529cfc,
-    0x40531d14,
-    0x40539d27,
-    0x40541d3c,
-    0x40549d5f,
-    0x40551d6d,
-    0x40559d8a,
-    0x40561d97,
-    0x40569db0,
-    0x40571dc8,
-    0x40579ddb,
-    0x40581df0,
-    0x40589e17,
-    0x40591e46,
-    0x40599e73,
-    0x405a1e87,
-    0x405a9e97,
-    0x405b1eaf,
-    0x405b9ec0,
-    0x405c1ed3,
-    0x405c9ef4,
-    0x405d1f01,
-    0x405d9f18,
-    0x405e1f56,
-    0x405e8a95,
-    0x405f1f77,
-    0x405f9f84,
-    0x40601f92,
-    0x40609fb4,
-    0x40611ff8,
-    0x4061a030,
-    0x40622047,
-    0x4062a058,
-    0x40632069,
-    0x4063a07e,
-    0x40642095,
-    0x4064a0c1,
-    0x406520dc,
-    0x4065a0f3,
-    0x4066210b,
-    0x4066a135,
-    0x40672160,
-    0x4067a181,
-    0x40682194,
-    0x4068a1b5,
-    0x406921e7,
-    0x4069a215,
-    0x406a2236,
-    0x406aa256,
-    0x406b23de,
-    0x406ba401,
-    0x406c2417,
-    0x406ca679,
-    0x406d26a8,
-    0x406da6d0,
-    0x406e26fe,
-    0x406ea732,
-    0x406f2751,
-    0x406fa766,
-    0x40702779,
-    0x4070a796,
+    0x40451c1b,
+    0x40459c2d,
+    0x40461c51,
+    0x40469c71,
+    0x40471c7f,
+    0x40479ca6,
+    0x40481cf9,
+    0x40489d2c,
+    0x40491d43,
+    0x40499d5d,
+    0x404a1d74,
+    0x404a9d92,
+    0x404b1daa,
+    0x404b9dc1,
+    0x404c1dd7,
+    0x404c9de9,
+    0x404d1e0a,
+    0x404d9e43,
+    0x404e1e57,
+    0x404e9e64,
+    0x404f1e91,
+    0x404f9eba,
+    0x40501ef5,
+    0x40509f09,
+    0x40511f24,
+    0x40521f34,
+    0x40529f58,
+    0x40531f70,
+    0x40539f83,
+    0x40541f98,
+    0x40549fbb,
+    0x40551fc9,
+    0x4055a006,
+    0x40562013,
+    0x4056a02c,
+    0x40572044,
+    0x4057a057,
+    0x4058206c,
+    0x4058a093,
+    0x405920c2,
+    0x4059a0ef,
+    0x405a2103,
+    0x405aa113,
+    0x405b212b,
+    0x405ba13c,
+    0x405c214f,
+    0x405ca18e,
+    0x405d219b,
+    0x405da1b2,
+    0x405e21f0,
+    0x405e8ab1,
+    0x405f2211,
+    0x405fa21e,
+    0x4060222c,
+    0x4060a24e,
+    0x40612292,
+    0x4061a2ca,
+    0x406222e1,
+    0x4062a2f2,
+    0x40632303,
+    0x4063a318,
+    0x4064232f,
+    0x4064a35b,
+    0x40652376,
+    0x4065a38d,
+    0x406623a5,
+    0x4066a3cf,
+    0x406723fa,
+    0x4067a41b,
+    0x40682463,
+    0x4068a484,
+    0x406924b6,
+    0x4069a4e4,
+    0x406a2505,
+    0x406aa525,
+    0x406b26ad,
+    0x406ba6d0,
+    0x406c26e6,
+    0x406ca961,
+    0x406d2990,
+    0x406da9b8,
+    0x406e29e6,
+    0x406eaa33,
+    0x406f2a52,
+    0x406faa8a,
+    0x40702a9d,
+    0x4070aaba,
     0x40710800,
-    0x4071a7a8,
-    0x407227bb,
-    0x4072a7d4,
-    0x407327ec,
-    0x4073936d,
-    0x40742800,
-    0x4074a81a,
-    0x4075282b,
-    0x4075a83f,
-    0x4076284d,
-    0x407691aa,
-    0x40772872,
-    0x4077a894,
-    0x407828af,
-    0x4078a8e8,
-    0x407928ff,
-    0x4079a915,
-    0x407a2921,
-    0x407aa934,
-    0x407b2949,
-    0x407ba95b,
-    0x407c2970,
-    0x407ca979,
-    0x407d21d0,
-    0x407d9c57,
-    0x407e28c4,
-    0x407e9e27,
-    0x407f1a67,
-    0x407f9887,
-    0x40801c2e,
-    0x40809a8f,
-    0x40811cea,
-    0x40819c08,
-    0x408226e9,
-    0x4082986d,
-    0x40831e02,
-    0x4083a0a6,
-    0x40841aa3,
-    0x40849e5f,
-    0x40851ee4,
-    0x40859fdc,
-    0x40861f38,
-    0x40869c71,
-    0x40872716,
-    0x4087a00d,
-    0x41f42309,
-    0x41f9239b,
-    0x41fe228e,
-    0x41fea46a,
-    0x41ff255b,
-    0x42032322,
-    0x42082344,
-    0x4208a380,
-    0x42092272,
-    0x4209a3ba,
-    0x420a22c9,
-    0x420aa2a9,
-    0x420b22e9,
-    0x420ba362,
-    0x420c2577,
-    0x420ca437,
-    0x420d2451,
-    0x420da488,
-    0x421224a2,
-    0x4217253e,
-    0x4217a4e4,
-    0x421c2506,
-    0x421f24c1,
-    0x4221258e,
-    0x42262521,
-    0x422b265d,
-    0x422ba60b,
-    0x422c2645,
-    0x422ca5ca,
-    0x422d25a9,
-    0x422da62a,
-    0x422e25f0,
+    0x4071aacc,
+    0x40722adf,
+    0x4072aaf8,
+    0x40732b10,
+    0x407394a4,
+    0x40742b24,
+    0x4074ab3e,
+    0x40752b4f,
+    0x4075ab63,
+    0x40762b71,
+    0x4076927b,
+    0x40772b96,
+    0x4077abb8,
+    0x40782bd3,
+    0x4078ac0c,
+    0x40792c23,
+    0x4079ac39,
+    0x407a2c45,
+    0x407aac58,
+    0x407b2c6d,
+    0x407bac7f,
+    0x407c2cb0,
+    0x407cacb9,
+    0x407d249f,
+    0x407d9eca,
+    0x407e2be8,
+    0x407ea0a3,
+    0x407f1c93,
+    0x407f9a53,
+    0x40801ea1,
+    0x40809cbb,
+    0x40811f46,
+    0x40819e7b,
+    0x408229d1,
+    0x40829a39,
+    0x4083207e,
+    0x4083a340,
+    0x40841ccf,
+    0x4084a0db,
+    0x40852160,
+    0x4085a276,
+    0x408621d2,
+    0x40869ee4,
+    0x40872a17,
+    0x4087a2a7,
+    0x40881a9a,
+    0x4088a42e,
+    0x40891ae9,
+    0x40899a76,
+    0x408a2706,
+    0x408a9884,
+    0x408b2c94,
+    0x408baa67,
+    0x408c2170,
+    0x408c98a0,
+    0x408d1d12,
+    0x408d9ce3,
+    0x408e1e2c,
+    0x408e9fe6,
+    0x408f2442,
+    0x41f425d8,
+    0x41f9266a,
+    0x41fe255d,
+    0x41fea752,
+    0x41ff2843,
+    0x420325f1,
+    0x42082613,
+    0x4208a64f,
+    0x42092541,
+    0x4209a689,
+    0x420a2598,
+    0x420aa578,
+    0x420b25b8,
+    0x420ba631,
+    0x420c285f,
+    0x420ca71f,
+    0x420d2739,
+    0x420da770,
+    0x4212278a,
+    0x42172826,
+    0x4217a7cc,
+    0x421c27ee,
+    0x421f27a9,
+    0x42212876,
+    0x42262809,
+    0x422b2945,
+    0x422ba8f3,
+    0x422c292d,
+    0x422ca8b2,
+    0x422d2891,
+    0x422da912,
+    0x422e28d8,
+    0x422ea9fe,
     0x4432072b,
     0x4432873a,
     0x44330746,
@@ -565,131 +591,144 @@
     0x44390800,
     0x4439880e,
     0x443a0821,
-    0x4c3211d4,
-    0x4c3291e4,
-    0x4c3311f7,
-    0x4c339217,
+    0x483212a5,
+    0x483292b7,
+    0x483312cd,
+    0x483392e6,
+    0x4c32130b,
+    0x4c32931b,
+    0x4c33132e,
+    0x4c33934e,
     0x4c3400ac,
     0x4c3480ea,
-    0x4c351223,
-    0x4c359231,
-    0x4c36124d,
-    0x4c369260,
-    0x4c37126f,
-    0x4c37927d,
-    0x4c381292,
-    0x4c38929e,
-    0x4c3912be,
-    0x4c3992e8,
-    0x4c3a1301,
-    0x4c3a931a,
+    0x4c35135a,
+    0x4c359368,
+    0x4c361384,
+    0x4c369397,
+    0x4c3713a6,
+    0x4c3793b4,
+    0x4c3813c9,
+    0x4c3893d5,
+    0x4c3913f5,
+    0x4c39941f,
+    0x4c3a1438,
+    0x4c3a9451,
     0x4c3b05fb,
-    0x4c3b9333,
-    0x4c3c1345,
-    0x4c3c9354,
-    0x4c3d136d,
-    0x4c3d937c,
-    0x4c3e1389,
-    0x50322bf2,
-    0x5032ac01,
-    0x50332c0c,
-    0x5033ac1c,
-    0x50342c35,
-    0x5034ac4f,
-    0x50352c5d,
-    0x5035ac73,
-    0x50362c85,
-    0x5036ac9b,
-    0x50372cb4,
-    0x5037acc7,
-    0x50382cdf,
-    0x5038acf0,
-    0x50392d05,
-    0x5039ad19,
-    0x503a2d39,
-    0x503aad4f,
-    0x503b2d67,
-    0x503bad79,
-    0x503c2d95,
-    0x503cadac,
-    0x503d2dc5,
-    0x503daddb,
-    0x503e2de8,
-    0x503eadfe,
-    0x503f2e10,
+    0x4c3b946a,
+    0x4c3c147c,
+    0x4c3c948b,
+    0x4c3d14a4,
+    0x4c3d8c45,
+    0x4c3e14fd,
+    0x4c3e94b3,
+    0x4c3f151f,
+    0x4c3f927b,
+    0x4c4014c9,
+    0x4c4092f7,
+    0x4c4114ed,
+    0x50322ef2,
+    0x5032af01,
+    0x50332f0c,
+    0x5033af1c,
+    0x50342f35,
+    0x5034af4f,
+    0x50352f5d,
+    0x5035af73,
+    0x50362f85,
+    0x5036af9b,
+    0x50372fb4,
+    0x5037afc7,
+    0x50382fdf,
+    0x5038aff0,
+    0x50393005,
+    0x5039b019,
+    0x503a3039,
+    0x503ab04f,
+    0x503b3067,
+    0x503bb079,
+    0x503c3095,
+    0x503cb0ac,
+    0x503d30c5,
+    0x503db0db,
+    0x503e30e8,
+    0x503eb0fe,
+    0x503f3110,
     0x503f8382,
-    0x50402e23,
-    0x5040ae33,
-    0x50412e4d,
-    0x5041ae5c,
-    0x50422e76,
-    0x5042ae93,
-    0x50432ea3,
-    0x5043aeb3,
-    0x50442ec2,
+    0x50403123,
+    0x5040b133,
+    0x5041314d,
+    0x5041b15c,
+    0x50423176,
+    0x5042b193,
+    0x504331a3,
+    0x5043b1b3,
+    0x504431c2,
     0x5044843f,
-    0x50452ed6,
-    0x5045aef4,
-    0x50462f07,
-    0x5046af1d,
-    0x50472f2f,
-    0x5047af44,
-    0x50482f6a,
-    0x5048af78,
-    0x50492f8b,
-    0x5049afa0,
-    0x504a2fb6,
-    0x504aafc6,
-    0x504b2fe6,
-    0x504baff9,
-    0x504c301c,
-    0x504cb04a,
-    0x504d305c,
-    0x504db079,
-    0x504e3094,
-    0x504eb0b0,
-    0x504f30c2,
-    0x504fb0d9,
-    0x505030e8,
+    0x504531d6,
+    0x5045b1f4,
+    0x50463207,
+    0x5046b21d,
+    0x5047322f,
+    0x5047b244,
+    0x5048326a,
+    0x5048b278,
+    0x5049328b,
+    0x5049b2a0,
+    0x504a32b6,
+    0x504ab2c6,
+    0x504b32e6,
+    0x504bb2f9,
+    0x504c331c,
+    0x504cb34a,
+    0x504d335c,
+    0x504db379,
+    0x504e3394,
+    0x504eb3b0,
+    0x504f33c2,
+    0x504fb3d9,
+    0x505033e8,
     0x505086ef,
-    0x505130fb,
-    0x58320ec9,
-    0x68320e8b,
-    0x68328c25,
-    0x68330c38,
-    0x68338e99,
-    0x68340ea9,
+    0x505133fb,
+    0x58320f3a,
+    0x68320efc,
+    0x68328c6a,
+    0x68330c7d,
+    0x68338f0a,
+    0x68340f1a,
     0x683480ea,
-    0x6c320e67,
-    0x6c328bfc,
-    0x6c330e72,
-    0x74320a0b,
-    0x78320970,
-    0x78328985,
-    0x78330991,
+    0x6c320ed8,
+    0x6c328c34,
+    0x6c330ee3,
+    0x74320a19,
+    0x743280ac,
+    0x74330c45,
+    0x7832097e,
+    0x78328993,
+    0x7833099f,
     0x78338083,
-    0x783409a0,
-    0x783489b5,
-    0x783509d4,
-    0x783589f6,
-    0x78360a0b,
-    0x78368a21,
-    0x78370a31,
-    0x78378a44,
-    0x78380a57,
-    0x78388a69,
-    0x78390a76,
-    0x78398a95,
-    0x783a0aaa,
-    0x783a8ab8,
-    0x783b0ac2,
-    0x783b8ad6,
-    0x783c0aed,
-    0x783c8b02,
-    0x783d0b19,
-    0x783d8b2e,
-    0x783e0a84,
-    0x7c3210d6,
+    0x783409ae,
+    0x783489c3,
+    0x783509e2,
+    0x78358a04,
+    0x78360a19,
+    0x78368a2f,
+    0x78370a3f,
+    0x78378a60,
+    0x78380a73,
+    0x78388a85,
+    0x78390a92,
+    0x78398ab1,
+    0x783a0ac6,
+    0x783a8ad4,
+    0x783b0ade,
+    0x783b8af2,
+    0x783c0b09,
+    0x783c8b1e,
+    0x783d0b35,
+    0x783d8b4a,
+    0x783e0aa0,
+    0x783e8a52,
+    0x7c321194,
 };
 
 const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]);
@@ -813,6 +852,7 @@
     "DIV_BY_ZERO\0"
     "EXPAND_ON_STATIC_BIGNUM_DATA\0"
     "INPUT_NOT_REDUCED\0"
+    "INVALID_INPUT\0"
     "INVALID_RANGE\0"
     "NEGATIVE_NUMBER\0"
     "NOT_A_SQUARE\0"
@@ -832,6 +872,7 @@
     "INPUT_NOT_INITIALIZED\0"
     "INVALID_AD_SIZE\0"
     "INVALID_KEY_LENGTH\0"
+    "INVALID_NONCE\0"
     "INVALID_NONCE_SIZE\0"
     "INVALID_OPERATION\0"
     "IV_TOO_LARGE\0"
@@ -851,11 +892,13 @@
     "MISSING_EQUAL_SIGN\0"
     "NO_CLOSE_BRACE\0"
     "UNABLE_TO_CREATE_NEW_SECTION\0"
+    "VARIABLE_EXPANSION_TOO_LONG\0"
     "VARIABLE_HAS_NO_VALUE\0"
     "BAD_GENERATOR\0"
     "INVALID_PUBKEY\0"
     "MODULUS_TOO_LARGE\0"
     "NO_PRIVATE_VALUE\0"
+    "UNKNOWN_HASH\0"
     "BAD_Q_VALUE\0"
     "BAD_VERSION\0"
     "MISSING_PARAMETERS\0"
@@ -876,11 +919,13 @@
     "INVALID_FORM\0"
     "INVALID_GROUP_ORDER\0"
     "INVALID_PRIVATE_KEY\0"
+    "INVALID_SCALAR\0"
     "MISSING_PRIVATE_KEY\0"
     "NON_NAMED_CURVE\0"
     "PKPARAMETERS2GROUP_FAILURE\0"
     "POINT_AT_INFINITY\0"
     "POINT_IS_NOT_ON_CURVE\0"
+    "PUBLIC_KEY_VALIDATION_FAILED\0"
     "SLOT_FULL\0"
     "UNDEFINED_GENERATOR\0"
     "UNKNOWN_GROUP\0"
@@ -905,8 +950,12 @@
     "INVALID_KEYBITS\0"
     "INVALID_MGF1_MD\0"
     "INVALID_PADDING_MODE\0"
+    "INVALID_PARAMETERS\0"
     "INVALID_PSS_SALTLEN\0"
+    "INVALID_SIGNATURE\0"
     "KEYS_NOT_SET\0"
+    "MEMORY_LIMIT_EXCEEDED\0"
+    "NOT_A_PRIVATE_KEY\0"
     "NO_DEFAULT_DIGEST\0"
     "NO_KEY_SET\0"
     "NO_MDC2_SUPPORT\0"
@@ -918,6 +967,7 @@
     "UNKNOWN_PUBLIC_KEY_TYPE\0"
     "UNSUPPORTED_ALGORITHM\0"
     "OUTPUT_TOO_LARGE\0"
+    "INVALID_OID_STRING\0"
     "UNKNOWN_NID\0"
     "BAD_BASE64_DECODE\0"
     "BAD_END_LINE\0"
@@ -933,6 +983,11 @@
     "SHORT_HEADER\0"
     "UNSUPPORTED_CIPHER\0"
     "UNSUPPORTED_ENCRYPTION\0"
+    "BAD_PKCS7_VERSION\0"
+    "NOT_PKCS7_SIGNED_DATA\0"
+    "NO_CERTIFICATES_INCLUDED\0"
+    "NO_CRLS_INCLUDED\0"
+    "BAD_ITERATION_COUNT\0"
     "BAD_PKCS12_DATA\0"
     "BAD_PKCS12_VERSION\0"
     "CIPHER_HAS_NO_OBJECT_IDENTIFIER\0"
@@ -953,8 +1008,11 @@
     "UNKNOWN_CIPHER\0"
     "UNKNOWN_CIPHER_ALGORITHM\0"
     "UNKNOWN_DIGEST\0"
-    "UNKNOWN_HASH\0"
+    "UNSUPPORTED_KEYLENGTH\0"
+    "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0"
+    "UNSUPPORTED_PRF\0"
     "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0"
+    "UNSUPPORTED_SALT_TYPE\0"
     "BAD_E_VALUE\0"
     "BAD_FIXED_HEADER_DECRYPT\0"
     "BAD_PAD_BYTE_COUNT\0"
@@ -994,6 +1052,8 @@
     "UNKNOWN_PADDING_TYPE\0"
     "VALUE_MISSING\0"
     "WRONG_SIGNATURE_LENGTH\0"
+    "ALPN_MISMATCH_ON_EARLY_DATA\0"
+    "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0"
     "APP_DATA_IN_HANDSHAKE\0"
     "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0"
     "BAD_ALERT\0"
@@ -1015,9 +1075,12 @@
     "BIO_NOT_SET\0"
     "BLOCK_CIPHER_PAD_IS_WRONG\0"
     "BUFFERED_MESSAGES_ON_CIPHER_CHANGE\0"
+    "CANNOT_HAVE_BOTH_PRIVKEY_AND_METHOD\0"
+    "CANNOT_PARSE_LEAF_CERT\0"
     "CA_DN_LENGTH_MISMATCH\0"
     "CA_DN_TOO_LONG\0"
     "CCS_RECEIVED_EARLY\0"
+    "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0"
     "CERTIFICATE_VERIFY_FAILED\0"
     "CERT_CB_ERROR\0"
     "CERT_LENGTH_MISMATCH\0"
@@ -1039,7 +1102,9 @@
     "DTLS_MESSAGE_TOO_BIG\0"
     "DUPLICATE_EXTENSION\0"
     "DUPLICATE_KEY_SHARE\0"
+    "EARLY_DATA_NOT_IN_USE\0"
     "ECC_CERT_NOT_FOR_SIGNING\0"
+    "EMPTY_HELLO_RETRY_REQUEST\0"
     "EMS_STATE_INCONSISTENT\0"
     "ENCRYPTED_LENGTH_TOO_LONG\0"
     "ERROR_ADDING_EXTENSION\0"
@@ -1050,6 +1115,7 @@
     "FRAGMENT_MISMATCH\0"
     "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0"
     "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0"
+    "HANDSHAKE_NOT_COMPLETE\0"
     "HTTPS_PROXY_REQUEST\0"
     "HTTP_REQUEST\0"
     "INAPPROPRIATE_FALLBACK\0"
@@ -1062,7 +1128,6 @@
     "INVALID_SSL_SESSION\0"
     "INVALID_TICKET_KEYS_LENGTH\0"
     "LENGTH_MISMATCH\0"
-    "LIBRARY_HAS_NO_CIPHERS\0"
     "MISSING_EXTENSION\0"
     "MISSING_KEY_SHARE\0"
     "MISSING_RSA_CERTIFICATE\0"
@@ -1071,6 +1136,7 @@
     "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0"
     "MTU_TOO_SMALL\0"
     "NEGOTIATED_BOTH_NPN_AND_ALPN\0"
+    "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0"
     "NESTED_GROUP\0"
     "NO_CERTIFICATES_RETURNED\0"
     "NO_CERTIFICATE_ASSIGNED\0"
@@ -1089,6 +1155,7 @@
     "NO_REQUIRED_DIGEST\0"
     "NO_SHARED_CIPHER\0"
     "NO_SHARED_GROUP\0"
+    "NO_SUPPORTED_VERSIONS_ENABLED\0"
     "NULL_SSL_CTX\0"
     "NULL_SSL_METHOD_PASSED\0"
     "OLD_SESSION_CIPHER_NOT_RETURNED\0"
@@ -1115,6 +1182,8 @@
     "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0"
     "SCSV_RECEIVED_WHEN_RENEGOTIATING\0"
     "SERVERHELLO_TLSEXT\0"
+    "SERVER_CERT_CHANGED\0"
+    "SERVER_ECHOED_INVALID_SESSION_ID\0"
     "SESSION_ID_CONTEXT_UNINITIALIZED\0"
     "SESSION_MAY_NOT_BE_CREATED\0"
     "SHUTDOWN_WHILE_IN_INIT\0"
@@ -1137,6 +1206,7 @@
     "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0"
     "SSL_HANDSHAKE_FAILURE\0"
     "SSL_SESSION_ID_CONTEXT_TOO_LONG\0"
+    "TICKET_ENCRYPTION_FAILED\0"
     "TLSV1_ALERT_ACCESS_DENIED\0"
     "TLSV1_ALERT_DECODE_ERROR\0"
     "TLSV1_ALERT_DECRYPTION_FAILED\0"
@@ -1162,9 +1232,11 @@
     "TOO_MANY_EMPTY_FRAGMENTS\0"
     "TOO_MANY_KEY_UPDATES\0"
     "TOO_MANY_WARNING_ALERTS\0"
+    "TOO_MUCH_READ_EARLY_DATA\0"
     "TOO_MUCH_SKIPPED_EARLY_DATA\0"
     "UNABLE_TO_FIND_ECDH_PARAMETERS\0"
     "UNEXPECTED_EXTENSION\0"
+    "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0"
     "UNEXPECTED_MESSAGE\0"
     "UNEXPECTED_OPERATOR_IN_GROUP\0"
     "UNEXPECTED_RECORD\0"
@@ -1188,10 +1260,10 @@
     "WRONG_SIGNATURE_TYPE\0"
     "WRONG_SSL_VERSION\0"
     "WRONG_VERSION_NUMBER\0"
+    "WRONG_VERSION_ON_EARLY_DATA\0"
     "X509_LIB\0"
     "X509_VERIFICATION_SETUP_PROBLEMS\0"
     "AKID_MISMATCH\0"
-    "BAD_PKCS7_VERSION\0"
     "BAD_X509_FILETYPE\0"
     "BASE64_DECODE_ERROR\0"
     "CANT_CHECK_DH_KEY\0"
@@ -1201,6 +1273,7 @@
     "IDP_MISMATCH\0"
     "INVALID_DIRECTORY\0"
     "INVALID_FIELD_NAME\0"
+    "INVALID_PARAMETER\0"
     "INVALID_PSS_PARAMETERS\0"
     "INVALID_TRUST\0"
     "ISSUER_MISMATCH\0"
@@ -1210,10 +1283,7 @@
     "LOADING_DEFAULTS\0"
     "NAME_TOO_LONG\0"
     "NEWER_CRL_NOT_NEWER\0"
-    "NOT_PKCS7_SIGNED_DATA\0"
-    "NO_CERTIFICATES_INCLUDED\0"
     "NO_CERT_SET_FOR_US_TO_VERIFY\0"
-    "NO_CRLS_INCLUDED\0"
     "NO_CRL_NUMBER\0"
     "PUBLIC_KEY_DECODE_ERROR\0"
     "PUBLIC_KEY_ENCODE_ERROR\0"
diff --git a/ios-aarch64/crypto/chacha/chacha-armv8.S b/ios-aarch64/crypto/chacha/chacha-armv8.S
new file mode 100644
index 0000000..195e8f0
--- /dev/null
+++ b/ios-aarch64/crypto/chacha/chacha-armv8.S
@@ -0,0 +1,1969 @@
+#include <openssl/arm_arch.h>
+
+.text
+
+
+
+.align	5
+Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+Lone:
+.long	1,0,0,0
+LOPENSSL_armcap_P:
+#ifdef	__ILP32__
+.long	_OPENSSL_armcap_P-.
+#else
+.quad	_OPENSSL_armcap_P-.
+#endif
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.globl	_ChaCha20_ctr32
+.private_extern	_ChaCha20_ctr32
+
+.align	5
+_ChaCha20_ctr32:
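+	// Arguments, per the loads below: x0 = out, x1 = in, x2 = length in
+	// bytes, x3 = 32-byte key, x4 = 16-byte counter block. A zero length
+	// returns immediately via Labort.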
+	cbz	x2,Labort
+	adr	x5,LOPENSSL_armcap_P
+	cmp	x2,#192
+	b.lo	Lshort
+#ifdef	__ILP32__
+	ldrsw	x6,[x5]
+#else
+	ldr	x6,[x5]
+#endif
+	ldr	w17,[x6,x5]
+	tst	w17,#ARMV7_NEON
+	b.ne	ChaCha20_neon
+
+Lshort:
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__ARMEB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
+Loop:
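+	// One column round plus one diagonal round per iteration; x4 counts
+	// ten iterations, i.e. the full 20 ChaCha rounds. The left-rotates by
+	// 16/12/8/7 appear as right-rotates by 16/20/24/25.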
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+Labort:
+	ret
+
+.align	4
+Ltail:
+	add	x2,x2,#64
+Less_than_64:
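+	// Final partial block: the keystream words are packed and parked on
+	// the stack, XORed with the input a byte at a time in Loop_tail, and
+	// the stack copy is wiped with the xzr stores afterwards.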
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+
+
+.align	5
+ChaCha20_neon:
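+	// NEON path: three blocks in vector registers interleaved with one
+	// block in general-purpose registers, 256 bytes per Loop_outer_neon
+	// pass; inputs of 512 bytes or more divert to L512_or_more_neon.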
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	Last_neon
+
+Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	Last_neon
+Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	Last_neon
+
+.align	4
+Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+
+.align	5
+ChaCha20_512_neon:
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+L512_or_more_neon:
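+	// Widest path: six blocks in vector registers plus two interleaved
+	// scalar blocks per outer pass (512 bytes). The scalar lanes run the
+	// rounds at twice the vector rate, so each scalar block completes
+	// within one of the upper/lower half-loops.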
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	Loop_outer
+
+Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
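
A hypothetical caller-side sketch of the contract the routine above
implements, matching how src/crypto/chacha/chacha.c drives the OpenSSL
ChaCha20_ctr32 assembly (counter[0] is the 32-bit block counter,
counter[1..3] carry the 96-bit nonce, and out may equal in):

    #include <stddef.h>
    #include <stdint.h>

    void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
                        const uint32_t key[8], const uint32_t counter[4]);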
diff --git a/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S b/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S
new file mode 100644
index 0000000..787cce2
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -0,0 +1,753 @@
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+
+.align	5
+_aes_hw_set_encrypt_key:
+Lenc_key:
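+	// x0 = user key, w1 = key length in bits, x2 = output key schedule.
+	// Returns (via x3 below) 0 on success, -1 for a NULL pointer and -2
+	// for a bit length other than 128, 192 or 256.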
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	Lenc_key_abort
+	cmp	x2,#0
+	b.eq	Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	Lenc_key_abort
+	cmp	w1,#256
+	b.gt	Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	Lenc_key_abort
+
+	adr	x3,Lrcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	Loop128
+	b.eq	L192
+	b	L256
+
+.align	4
+Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	Ldone
+
+.align	4
+L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	Loop256
+
+Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+
+.align	5
+_aes_hw_set_decrypt_key:
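+	// Builds the encryption schedule via Lenc_key, then converts it in
+	// place for decryption: the round keys are reversed and AESIMC
+	// (InvMixColumns) is applied to every key except the first and last.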
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	Lenc_key
+
+	cmp	x0,#0
+	b.ne	Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	ret
+
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+
+.align	5
+_aes_hw_encrypt:
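+	// Single-block encrypt: x0 = in, x1 = out, x2 = key schedule, with
+	// the round count at offset 240. Loop_enc runs two AESE+AESMC rounds
+	// per iteration; the final AESE is followed by an EOR with the last
+	// round key instead of AESMC.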
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+
+.align	5
+_aes_hw_decrypt:
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+
+.align	5
+_aes_hw_cbc_encrypt:
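+	// x0 = in, x1 = out, x2 = length, x3 = key schedule, x4 = IV,
+	// w5 = direction flag (nonzero encrypts, zero takes Lcbc_dec).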
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+.align	5
+Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
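+	// Decryption of three blocks is interleaved below: unlike CBC
+	// encryption, the blocks are independent (the chaining XOR uses
+	// only ciphertext), so 3x interleaving hides aesd/aesimc latency.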
+Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// w6 (and hence x6) is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with the last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+Lcbc_done:
+	st1	{v6.16b},[x4]
+Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+
+.align	5
+_aes_hw_ctr32_encrypt_blocks:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+#ifndef __ARMEB__
+	rev	w8, w8
+#endif
+	orr	v1.16b,v0.16b,v0.16b
+	add	w10, w8, #1
+	orr	v18.16b,v0.16b,v0.16b
+	add	w8, w8, #2
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v1.s[3],w10
+	b.ls	Lctr32_tail
+	rev	w12, w8
+	sub	x2,x2,#3		// bias
+	mov	v18.s[3],w12
+	b	Loop3x_ctr32
+
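+	// Three counter blocks are kept in flight; the 32-bit counters are
+	// maintained as native integers in w-registers and byte-reversed
+	// (rev) into lane 3 of v0/v1/v18 before each trip through the loop.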
+.align	4
+Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	orr	v0.16b,v6.16b,v6.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	orr	v1.16b,v6.16b,v6.16b
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	orr	v18.16b,v6.16b,v6.16b
+	add	w9,w8,#1
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	eor	v19.16b,v19.16b,v7.16b
+	rev	w9,w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	mov	v0.s[3], w9
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	mov	v1.s[3], w10
+	rev	w12,w8
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	mov	v18.s[3], w12
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	Lctr32_done
+	st1	{v3.16b},[x1]
+
+Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+
+#endif
diff --git a/ios-aarch64/crypto/fipsmodule/armv8-mont.S b/ios-aarch64/crypto/fipsmodule/armv8-mont.S
new file mode 100644
index 0000000..5626574
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -0,0 +1,1405 @@
+.text
+
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+
+.align	5
+_bn_mul_mont:
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
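+	// Dispatch: x5 holds num.  When num is a multiple of 8 the 8x
+	// squaring-capable code is used (it falls back to the 4x code
+	// internally unless ap==bp), when a multiple of 4 the 4x code,
+	// otherwise the generic word-by-word loop below.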
+Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	As for the removal of the first multiplication and
+	//	addition instructions: the outcome of the first addition
+	//	is guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it doesn't.
+	//	So when does it carry? Is there an alternative way to
+	//	deduce it? If you follow the operations, you can observe
+	//	that the condition for a carry is quite simple: x6 being
+	//	non-zero. That carry can therefore be calculated by adding
+	//	-1 to x6, which is what the next instruction does.
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
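+	// (On AArch64, subs sets C to NOT-borrow, so x6-1 leaves C=1
+	// exactly when x6!=0; the adc above folds that carry into x13.
+	// This matches the discarded adds: lo(np[0]*m1) == -x6 (mod 2^64)
+	// by the choice of n0, so that addition would produce 0 with a
+	// carry iff x6 is non-zero.)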
+	cbz	x21,L1st_skip
+
+L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,L1st
+
+L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// upmost overflow bit
+	stp	x12,x13,[x22]
+
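+	// Each pass over Louter folds one more word of b[] into the running
+	// accumulator using the word-serial Montgomery recurrence
+	// t = (t + bp[i]*a + m*n) / 2^64, with m = (t[0]+bp[i]*a[0])*n0
+	// mod 2^64 chosen so that the low word of the sum cancels exactly.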
+Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,Linner_skip
+
+Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,Linner
+
+Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// upmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,Louter
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
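+	// The subtraction runs unconditionally and Lcond_copy then uses the
+	// preserved borrow (csel ... lo) to keep either the difference or
+	// the original t[] word, so no branch depends on the comparison.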
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	ret
+
+
+.align	5
+__bn_sqr8x_mont:
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	Lsqr8x_zero_start
+
+Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
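+	// To build the decryption schedule, the encryption round keys are
+	// reversed in place (swapped from both ends) and the inner keys are
+	// passed through aesimc (InvMixColumns), as the AES equivalent
+	// inverse cipher requires; the outermost pair just below is swapped
+	// untransformed.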
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+Lsqr8x_mul:
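+	// Two rounds per iteration, loading the next pair of round keys in
+	// the shadow of aese/aesmc; the last round (aese with no aesmc,
+	// then eor with the final round key) is peeled off after the loop.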
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_mul
+
+.align	4
+Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	Lsqr8x_outer_loop
+
+.align	4
+Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
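+	// The doubling uses extr: extr xD,xHi,xLo,#63 yields
+	// (xHi<<1)|(xLo>>63), i.e. each word of the cross-product sum
+	// shifted left by one with the incoming bit taken from the word
+	// below; the diagonal squares a[i]^2 are then folded in through
+	// the adcs chain.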
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
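+	// (the loop below runs eight times, each time cancelling t[0] by
+	// adding n*(t[0]*n0 mod 2^64) -- the same identity as in Lmul_mont
+	// above -- so one 512-bit window of t[] is retired per pass)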
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_tail
+
+.align	4
+Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	Lsqr8x_reduction
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	Lsqr8x_done
+
+.align	4
+Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold the result, x6-x13 hold the modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	ret
+
+
+.align	5
+__bn_mul4x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
+Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_reduction
+
+	cbz	x10,Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_1st_tail
+
+.align	5
+Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np?
+	adc	x0,x0,xzr
+	cbz	x10,Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_tail
+
+.align	4
+Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	Loop_mul4x_reduction
+
+.align	4
+Lmul4x_post:
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction, so we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	Lmul4x_done
+
+.align	4
+Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold the result, x14-x17 hold the modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	ret
+
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
diff --git a/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S
new file mode 100644
index 0000000..cdf64e6
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -0,0 +1,231 @@
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_gcm_init_v8
+.private_extern	_gcm_init_v8
+
+.align	4
+_gcm_init_v8:
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
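+	// The "twist" above is H<<1 reduced modulo the GHASH polynomial:
+	// the bit shifted out at the top (broadcast by sshr) selects
+	// whether the 0xc2...01 constant is XORed back in.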
+
+	//calculate H^2
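+	// One-level Karatsuba over GF(2)[x]: three pmulls (lo*lo, hi*hi,
+	// (lo^hi)*(lo^hi)) replace four, and since addition is XOR the
+	// middle term is recovered as the XOR of all three products.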
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
+
+	ret
+
+.globl	_gcm_gmult_v8
+.private_extern	_gcm_gmult_v8
+
+.align	4
+_gcm_gmult_v8:
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.globl	_gcm_ghash_v8
+.private_extern	_gcm_ghash_v8
+
+.align	4
+_gcm_ghash_v8:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//the loaded value would have
+						//to be rotated in order to
+						//make it appear as in the
+						//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as the loop is modulo-scheduled,
+						//x12 is zeroed just in time
+						//to preclude overstepping
+						//inp[len], which means that
+						//the last block[s] are actually
+						//loaded twice, but the last
+						//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __ARMEB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	Loop_mod2x_v8
+
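+	// 2x aggregation: each trip computes Xi = (Xi^I[i])*H^2 ^ I[i+1]*H,
+	// paying for a single reduction per two input blocks.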
+.align	4
+Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __ARMEB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	Ldone_v8		//is x3 zero?
+Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
diff --git a/ios-aarch64/crypto/fipsmodule/sha1-armv8.S b/ios-aarch64/crypto/fipsmodule/sha1-armv8.S
new file mode 100644
index 0000000..de9cdf8
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -0,0 +1,1222 @@
+#include <openssl/arm_arch.h>
+
+.text
+
+
+.globl	_sha1_block_data_order
+.private_extern	_sha1_block_data_order
+
+.align	6
+_sha1_block_data_order:
+#ifdef	__ILP32__
+	ldrsw	x16,LOPENSSL_armcap_P
+#else
+	ldr	x16,LOPENSSL_armcap_P
+#endif
+	adr	x17,LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA1
+	b.ne	Lv8_entry
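+	// Runtime dispatch: the word at LOPENSSL_armcap_P holds the offset
+	// from that label to OPENSSL_armcap_P (ldrsw vs ldr covers ILP32);
+	// adding it to the label's own address (adr) locates the capability
+	// flags, and the ARMV8_SHA1 bit diverts to the hardware path at
+	// Lv8_entry.  The scalar implementation below is the fallback.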
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__ARMEB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
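+	// movz/movk materialize K_40_59 = 0x8f1bbcdc; rounds 40-59 switch to
+	// F = Maj(b,c,d), built below as ((b|c)&d)|(b&c) from orr/and/and/orr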
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
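+	// movz/movk materialize K_60_79 = 0xca62c1d6; rounds 60-79 revert to
+	// the parity function F = b^c^d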
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+
+
+.align	6
+sha1_block_armv8:
+Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adr	x4,Lconst
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
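+	// v16-v19 = K_00_19..K_60_79 from Lconst, each round constant
+	// splatted across all four lanes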
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
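+	// orr Vd,Vn,Vn is the NEON register-copy idiom: v22 preserves the
+	// incoming state so it can be added back after the 80 rounds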
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
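+	// the .long words below are hand-encoded ARMv8 Crypto Extension
+	// instructions (sha1h/sha1c/sha1p/sha1m/sha1su0/sha1su1), emitted as
+	// raw opcodes so the file assembles even with toolchains that lack
+	// the SHA mnemonics; each carries its intended mnemonic as a comment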
+.long	0x5e280803	//sha1h v3.16b,v0.16b
+.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+.align	6
+Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+LOPENSSL_armcap_P:
+#ifdef	__ILP32__
+.long	_OPENSSL_armcap_P-.
+#else
+.quad	_OPENSSL_armcap_P-.
+#endif
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.comm	_OPENSSL_armcap_P,4,4
diff --git a/ios-aarch64/crypto/fipsmodule/sha256-armv8.S b/ios-aarch64/crypto/fipsmodule/sha256-armv8.S
new file mode 100644
index 0000000..43811dd
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -0,0 +1,1201 @@
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	an indication of some compiler "pathology"; most notably, code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.globl	_sha256_block_data_order
+.private_extern	_sha256_block_data_order
+
+.align	6
+_sha256_block_data_order:
+#ifndef	__KERNEL__
+# ifdef	__ILP32__
+	ldrsw	x16,LOPENSSL_armcap_P
+# else
+	ldr	x16,LOPENSSL_armcap_P
+# endif
+	adr	x17,LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA256
+	b.ne	Lv8_entry
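+	// no SHA-256 extension present: fall through to the scalar code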
+#endif
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adr	x30,LK256
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
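+	// Maj(a,b,c) is computed as ((a^b)&(b^c))^b; each round's a^b eor
+	// doubles as the next round's b^c, so w19 and w28 alternate carrying it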
+	str	x1,[x29,#112]
+#ifndef	__ARMEB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
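+	// Sigma1(e) = ror6 ^ ror11 ^ ror25, folded as
+	// ror(e,6) ^ ror(e ^ ror(e,14), 11) since 11+14 = 25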
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
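+	// h+=Sigma0(a) is deferred (commented out above) and performed by the
+	// first add of the next round, keeping the dependency chain short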
+#ifndef	__ARMEB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,Loop_16_xx
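+	// LK256 ends in a zero terminator, so w19 becomes 0 after the 64th
+	// constant and the loop exits; x30 then rewinds 65*4 = 260 bytes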
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+
+
+.align	6
+
+LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
+
+#ifndef	__KERNEL__
+.align	3
+LOPENSSL_armcap_P:
+# ifdef	__ILP32__
+.long	_OPENSSL_armcap_P-.
+# else
+.quad	_OPENSSL_armcap_P-.
+# endif
+#endif
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#ifndef	__KERNEL__
+
+.align	6
+sha256_block_armv8:
+Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adr	x3,LK256
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
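+	// rev32 byte-swaps each 32-bit lane: SHA-256 message words are big-endian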
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#ifndef	__KERNEL__
+.comm	_OPENSSL_armcap_P,4,4
+#endif
diff --git a/ios-aarch64/crypto/fipsmodule/sha512-armv8.S b/ios-aarch64/crypto/fipsmodule/sha512-armv8.S
new file mode 100644
index 0000000..41159c3
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -0,0 +1,1073 @@
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are an
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is only 40-90%.
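+//
+// For orientation, a minimal C sketch of the single round which the
+// unrolled integer code below implements (ror64, K and X are
+// illustrative names, not symbols defined in this file):
+//
+//	uint64_t S1  = ror64(e,14) ^ ror64(e,18) ^ ror64(e,41); // Sigma1(e)
+//	uint64_t ch  = (e & f) ^ (~e & g);                       // Ch(e,f,g)
+//	uint64_t S0  = ror64(a,28) ^ ror64(a,34) ^ ror64(a,39); // Sigma0(a)
+//	uint64_t maj = (a & b) ^ (a & c) ^ (b & c);              // Maj(a,b,c)
+//	uint64_t T1  = h + S1 + ch + K[i] + X[i];
+//	d += T1; h = T1 + S0 + maj;	// then rotate a,...,h by one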
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.globl	_sha512_block_data_order
+.private_extern	_sha512_block_data_order
+
+.align	6
+_sha512_block_data_order:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adr	x30,LK512
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__ARMEB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
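+	// From here on the 16-word message-schedule window is extended in
+	// place. A hedged C sketch of one schedule step (the FIPS 180-4
+	// small sigmas; ror64 is an illustrative helper):
+	//
+	//	sigma0(x) = ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7)
+	//	sigma1(x) = ror64(x,19) ^ ror64(x,61) ^ (x >> 6)
+	//	X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]);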
+Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
+
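+	// Per-block feed-forward (Davies-Meyer), as in this illustrative
+	// C sketch: for (i = 0; i < 8; i++) H[i] += w[i]; where w holds
+	// the working variables a..h and H is the hash context at x0.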
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+
+
+.align	6
+
+LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+
+#ifndef	__KERNEL__
+.align	3
+LOPENSSL_armcap_P:
+# ifdef	__ILP32__
+.long	_OPENSSL_armcap_P-.
+# else
+.quad	_OPENSSL_armcap_P-.
+# endif
+#endif
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#ifndef	__KERNEL__
+.comm	_OPENSSL_armcap_P,4,4
+#endif
diff --git a/ios-arm/crypto/chacha/chacha-armv4.S b/ios-arm/crypto/chacha/chacha-armv4.S
new file mode 100644
index 0000000..2ec7129
--- /dev/null
+++ b/ios-arm/crypto/chacha/chacha-armv4.S
@@ -0,0 +1,1483 @@
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+Lone:
+.long	1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-LChaCha20_ctr32
+#else
+.word	-1
+#endif
+
+.globl	_ChaCha20_ctr32
+.private_extern	_ChaCha20_ctr32
+#ifdef __thumb2__
+.thumb_func	_ChaCha20_ctr32
+#endif
+.align	5
+_ChaCha20_ctr32:
+LChaCha20_ctr32:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r14,pc,#16		@ _ChaCha20_ctr32
+#else
+	adr	r14,LChaCha20_ctr32
+#endif
+	cmp	r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt	eq
+#endif
+	addeq	sp,sp,#4*3
+	beq	Lno_data
+#if __ARM_MAX_ARCH__>=7
+	cmp	r2,#192			@ test len
+	bls	Lshort
+	ldr	r4,[r14,#-32]
+	ldr	r4,[r14,r4]
+# ifdef	__APPLE__
+	ldr	r4,[r4]
+# endif
+	tst	r4,#ARMV7_NEON
+	bne	LChaCha20_neon
+Lshort:
+#endif
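+	@ The runtime dispatch above is equivalent to this hedged C sketch
+	@ (argument names are illustrative only):
+	@
+	@	if (len > 192 && (OPENSSL_armcap_P & ARMV7_NEON))
+	@		return ChaCha20_neon(out, in, len, key, counter);
+	@	/* otherwise fall through to the integer-only path */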
+	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area
+	sub	r14,r14,#64		@ Lsigma
+	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
+	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	Loop_outer_enter
+
+.align	4
+Loop_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	b	Loop
+
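+@ One pass of Loop below is a ChaCha20 double round: a column round
+@ followed by a diagonal round, each built from the quarter round in
+@ this hedged C sketch (rol32 is illustrative; the scalar code keeps
+@ words pre-rotated and realizes rol32(x,n) as ror#(32-n)):
+@
+@	a += b; d ^= a; d = rol32(d,16);
+@	c += d; b ^= c; b = rol32(b,12);
+@	a += b; d ^= a; d = rol32(d, 8);
+@	c += d; b ^= c; b = rol32(b, 7);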
+.align	4
+Loop:
+	subs	r11,r11,#1
+	add	r0,r0,r4
+	mov	r12,r12,ror#16
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]
+	ldr	r10,[sp,#4*(16+15)]
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]
+	ldr	r8,[sp,#4*(16+10)]
+	add	r2,r2,r6
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]
+	ldr	r9,[sp,#4*(16+11)]
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]
+	ldr	r10,[sp,#4*(16+13)]
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]
+	ldr	r8,[sp,#4*(16+8)]
+	add	r2,r2,r7
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]
+	ldr	r9,[sp,#4*(16+9)]
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	Loop
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	cmp	r11,#64		@ done yet?
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	Lunaligned
+	cmp	r11,#64		@ restore flags
+# else
+	ldr	r10,[sp,#4*(2)]
+# endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1		@ next counter value
+	strhi	r8,[sp,#4*(12)]	@ save next counter value
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	Loop_outer
+
+	beq	Ldone
+# if __ARM_ARCH__<7
+	b	Ltail
+
+.align	4
+Lunaligned:@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	Loop_outer
+
+	beq	Ldone
+#endif
+
+Ltail:
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
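+@ Loop_tail is byte-granular, equivalent to this illustrative C sketch:
+@	while (len--) *out++ = *in++ ^ *ks++;	/* ks = block on stack */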
+Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	Loop_tail
+
+Ldone:
+	add	sp,sp,#4*(32+3)
+Lno_data:
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func	ChaCha20_neon
+#endif
+.align	5
+ChaCha20_neon:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+LChaCha20_neon:
+	adr	r14,Lsigma
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
+	stmdb	sp!,{r0,r1,r2,r3}
+
+	vld1.32	{q1,q2},[r3]		@ load key
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+
+	sub	sp,sp,#4*(16+16)
+	vld1.32	{q3},[r12]		@ load counter and nonce
+	add	r12,sp,#4*8
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	vld1.32	{q0},[r14]!		@ load sigma
+	vld1.32	{q12},[r14]		@ one
+	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
+
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr	d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr	d26,[sp,#4*(16+2)]
+	vmov	q4,q0
+	vstr	d28,[sp,#4*(16+4)]
+	vmov	q8,q0
+	vmov	q5,q1
+	vmov	q9,q1
+	b	Loop_neon_enter
+
+.align	4
+Loop_neon_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	cmp	r11,#64*2		@ if len<=64*2
+	bls	Lbreak_neon		@ switch to integer-only
+	vmov	q4,q0
+	str	r11,[sp,#4*(32+2)]	@ save len
+	vmov	q8,q0
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	vmov	q5,q1
+	str	r14,  [sp,#4*(32+0)]	@ save out
+	vmov	q9,q1
+Loop_neon_enter:
+	ldr	r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov	q6,q2
+	ldr	r10, [sp,#4*(13)]
+	vmov	q10,q2
+	ldr	r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	add	r12,r12,#3	@ counter+3
+	b	Loop_neon
+
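+@ In the NEON lanes below a rotation rol32(x,n) is realized with a
+@ shift pair, e.g. for n=12 (a hedged sketch of the idiom):
+@	vshr.u32	t,x,#20
+@	vsli.32	t,x,#12		@ t = (x<<12)|(x>>20)
+@ while the 16-bit rotation uses vrev32.16; the scalar rounds are
+@ interleaved with the vector code throughout.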
+.align	4
+Loop_neon:
+	subs	r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne	Loop_neon
+
+	add	r11,sp,#32
+	vld1.32	{q12,q13},[sp]		@ load key material
+	vld1.32	{q14,q15},[r11]
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr	d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp	r11,#64*4
+	blo	Ltail_neon
+
+	vld1.8	{q12,q13},[r12]!	@ load input
+	mov	r11,sp
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12		@ xor with input
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	vst1.8	{q0,q1},[r14]!	@ store output
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vld1.32	{q0,q1},[r11]!	@ load for next iteration
+	veor	d25,d25,d25
+	vldr	d24,[sp,#4*(16+4)]	@ four
+	veor	q9,q9,q13
+	vld1.32	{q2,q3},[r11]
+	veor	q10,q10,q14
+	vst1.8	{q4,q5},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q6,q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	vst1.8	{q8,q9},[r14]!
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+	vst1.8	{q10,q11},[r14]!
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8	@ xor with input
+	add	r8,sp,#4*(4)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r5,r5,r9
+	ldr	r9,[r12,#-12]
+	add	r6,r6,r10
+	ldr	r10,[r12,#-8]
+	add	r7,r7,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+	add	r8,sp,#4*(8)
+	eor	r5,r5,r9
+	str	r4,[r14],#16		@ store output
+	eor	r6,r6,r10
+	str	r5,[r14,#-12]
+	eor	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8
+	add	r8,sp,#4*(12)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,r8,#4		@ next counter value
+	add	r5,r5,r9
+	str	r8,[sp,#4*(12)]	@ save next counter value
+	ldr	r8,[r12],#16		@ load input
+	add	r6,r6,r10
+	add	r4,r4,#3		@ counter+3
+	ldr	r9,[r12,#-12]
+	add	r7,r7,r11
+	ldr	r10,[r12,#-8]
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+# ifdef	__thumb2__
+	it	hi
+# endif
+	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
+	eor	r5,r5,r9
+	eor	r6,r6,r10
+	str	r4,[r14],#16		@ store output
+	eor	r7,r7,r11
+	str	r5,[r14,#-12]
+	sub	r11,r8,#64*4	@ len-=64*4
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	Loop_neon_outer
+
+	b	Ldone_neon
+
+.align	4
+Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str	r11, [sp,#4*(20+32+2)]	@ save len
+	add	r11,sp,#4*(32+4)
+	str	r12,   [sp,#4*(20+32+1)]	@ save inp
+	str	r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr	r12,[sp,#4*(16+10)]
+	ldr	r14,[sp,#4*(16+11)]
+	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
+	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(20+16+15)]
+	add	r11,sp,#4*(20)
+	vst1.32	{q0,q1},[r11]!		@ copy key
+	add	sp,sp,#4*(20)			@ switch frame
+	vst1.32	{q2,q3},[r11]
+	mov	r11,#10
+	b	Loop				@ go integer-only
+
+.align	4
+Ltail_neon:
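+	@ Fewer than 256 bytes remain: flush as many whole 64-byte
+	@ blocks as still fit, then byte-xor the leftovers in
+	@ Loop_tail_neon.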
+	cmp	r11,#64*3
+	bhs	L192_or_more_neon
+	cmp	r11,#64*2
+	bhs	L128_or_more_neon
+	cmp	r11,#64*1
+	bhs	L64_or_more_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q0,q1},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q2,q3},[r8]
+	b	Loop_tail_neon
+
+.align	4
+L64_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vst1.8	{q0,q1},[r14]!
+	vst1.8	{q2,q3},[r14]!
+
+	beq	Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q4,q5},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q6,q7},[r8]
+	sub	r11,r11,#64*1	@ len-=64*1
+	b	Loop_tail_neon
+
+.align	4
+L128_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vst1.8	{q0,q1},[r14]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vst1.8	{q4,q5},[r14]!
+	vst1.8	{q6,q7},[r14]!
+
+	beq	Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q8,q9},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q10,q11},[r8]
+	sub	r11,r11,#64*2	@ len-=64*2
+	b	Loop_tail_neon
+
+.align	4
+L192_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q0,q1},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vst1.8	{q2,q3},[r14]!
+	veor	q9,q9,q13
+	vst1.8	{q4,q5},[r14]!
+	veor	q10,q10,q14
+	vst1.8	{q6,q7},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q8,q9},[r14]!
+	vst1.8	{q10,q11},[r14]!
+
+	beq	Ldone_neon
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(4)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r0,sp,#4*(16+8)
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(12)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r4,r4,#3		@ counter+3
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldr	r11,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r10,sp,#4*(0)
+	sub	r11,r11,#64*3	@ len-=64*3
+
+Loop_tail_neon:
+	ldrb	r8,[r10],#1	@ read buffer on stack
+	ldrb	r9,[r12],#1		@ read input
+	subs	r11,r11,#1
+	eor	r8,r8,r9
+	strb	r8,[r14],#1		@ store output
+	bne	Loop_tail_neon
+
+Ldone_neon:
+	add	sp,sp,#4*(32+4)
+	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+	add	sp,sp,#4*(16+3)
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+#endif
diff --git a/ios-arm/crypto/fipsmodule/aes-armv4.S b/ios-arm/crypto/fipsmodule/aes-armv4.S
new file mode 100644
index 0000000..8d43e36
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/aes-armv4.S
@@ -0,0 +1,1218 @@
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ AES for ARMv4
+
+@ January 2007.
+@
+@ The code uses a single 1KB S-box and is more than 2 times faster
+@ than code generated by gcc-3.4.1. This is thanks to a unique
+@ feature of the ARMv4 ISA, which allows a logical or arithmetic
+@ operation to be merged with a shift or rotate in one instruction,
+@ emitting the combined result every cycle. The module is
+@ endian-neutral. The performance is ~42 cycles/byte for a 128-bit
+@ key [on a single-issue Xscale PXA250 core].
+
+@ May 2007.
+@
+@ AES_set_[en|de]crypt_key is added.
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 12% improvement on
+@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~21.5 cycles per byte.
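+
+@ Only the 1KB Te0 table is kept resident; the Te1/Te2/Te3 values of
+@ a four-table implementation are recovered for free by the barrel
+@ shifter, since Te1[x]=ROR(Te0[x],8) and so on. In C terms, one
+@ output word of an inner round is roughly
+@
+@	t0 = Te0[s0>>24] ^ ROR(Te0[(s1>>16)&255],8)
+@	                 ^ ROR(Te0[(s2>>8)&255],16)
+@	                 ^ ROR(Te0[s3&255],24) ^ rk[i];
+@
+@ which is what the ror-qualified eor operands in Lenc_loop compute.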
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 AES
+@ instructions are in aesv8-armx.pl.)
+
+
+.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#undef __thumb2__
+#endif
+
+
+.align	5
+AES_Te:
+.word	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+.word	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+.word	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+.word	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+.word	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+.word	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+.word	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+.word	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+.word	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+.word	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+.word	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+.word	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+.word	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+.word	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+.word	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+.word	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+.word	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+.word	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+.word	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+.word	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+.word	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+.word	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+.word	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+.word	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+.word	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+.word	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+.word	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+.word	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+.word	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+.word	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+.word	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+.word	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+.word	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+.word	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+.word	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+.word	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+.word	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+.word	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+.word	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+.word	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+.word	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+.word	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+.word	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+.word	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+.word	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+.word	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+.word	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+.word	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+.word	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+.word	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+.word	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+.word	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+.word	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+.word	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+.word	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+.word	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+.word	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+.word	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+.word	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+.word	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+.word	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+.word	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+.word	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+.word	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+@ Te4[256]
+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+@ rcon[]
+.word	0x01000000, 0x02000000, 0x04000000, 0x08000000
+.word	0x10000000, 0x20000000, 0x40000000, 0x80000000
+.word	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
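+@ rcon[i] is the key-schedule round constant x^i in GF(2^8), carried
+@ in the top byte; the expansion loops below consume one word per
+@ round via "ldr r4,[r6],#4".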
+
+
+@ void asm_AES_encrypt(const unsigned char *in, unsigned char *out,
+@ 		       const AES_KEY *key) {
+.globl	_asm_AES_encrypt
+.private_extern	_asm_AES_encrypt
+#ifdef __thumb2__
+.thumb_func	_asm_AES_encrypt
+#endif
+.align	5
+_asm_AES_encrypt:
+#ifndef	__thumb2__
+	sub	r3,pc,#8		@ _asm_AES_encrypt
+#else
+	adr	r3,.
+#endif
+	stmdb	sp!,{r1,r4-r12,lr}
+#ifdef	__APPLE__
+	adr	r10,AES_Te
+#else
+	sub	r10,r3,#_asm_AES_encrypt-AES_Te	@ Te
+#endif
+	mov	r12,r0		@ inp
+	mov	r11,r2
+#if __ARM_ARCH__<7
+	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
+	ldrb	r4,[r12,#2]	@ manner...
+	ldrb	r5,[r12,#1]
+	ldrb	r6,[r12,#0]
+	orr	r0,r0,r4,lsl#8
+	ldrb	r1,[r12,#7]
+	orr	r0,r0,r5,lsl#16
+	ldrb	r4,[r12,#6]
+	orr	r0,r0,r6,lsl#24
+	ldrb	r5,[r12,#5]
+	ldrb	r6,[r12,#4]
+	orr	r1,r1,r4,lsl#8
+	ldrb	r2,[r12,#11]
+	orr	r1,r1,r5,lsl#16
+	ldrb	r4,[r12,#10]
+	orr	r1,r1,r6,lsl#24
+	ldrb	r5,[r12,#9]
+	ldrb	r6,[r12,#8]
+	orr	r2,r2,r4,lsl#8
+	ldrb	r3,[r12,#15]
+	orr	r2,r2,r5,lsl#16
+	ldrb	r4,[r12,#14]
+	orr	r2,r2,r6,lsl#24
+	ldrb	r5,[r12,#13]
+	ldrb	r6,[r12,#12]
+	orr	r3,r3,r4,lsl#8
+	orr	r3,r3,r5,lsl#16
+	orr	r3,r3,r6,lsl#24
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#endif
+	bl	_armv4_AES_encrypt
+
+	ldr	r12,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r12,#0]
+	str	r1,[r12,#4]
+	str	r2,[r12,#8]
+	str	r3,[r12,#12]
+#else
+	mov	r4,r0,lsr#24		@ write output in endian-neutral
+	mov	r5,r0,lsr#16		@ manner...
+	mov	r6,r0,lsr#8
+	strb	r4,[r12,#0]
+	strb	r5,[r12,#1]
+	mov	r4,r1,lsr#24
+	strb	r6,[r12,#2]
+	mov	r5,r1,lsr#16
+	strb	r0,[r12,#3]
+	mov	r6,r1,lsr#8
+	strb	r4,[r12,#4]
+	strb	r5,[r12,#5]
+	mov	r4,r2,lsr#24
+	strb	r6,[r12,#6]
+	mov	r5,r2,lsr#16
+	strb	r1,[r12,#7]
+	mov	r6,r2,lsr#8
+	strb	r4,[r12,#8]
+	strb	r5,[r12,#9]
+	mov	r4,r3,lsr#24
+	strb	r6,[r12,#10]
+	mov	r5,r3,lsr#16
+	strb	r2,[r12,#11]
+	mov	r6,r3,lsr#8
+	strb	r4,[r12,#12]
+	strb	r5,[r12,#13]
+	strb	r6,[r12,#14]
+	strb	r3,[r12,#15]
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+#ifdef __thumb2__
+.thumb_func	_armv4_AES_encrypt
+#endif
+.align	2
+_armv4_AES_encrypt:
+	str	lr,[sp,#-4]!		@ push lr
+	ldmia	r11!,{r4,r5,r6,r7}
+	eor	r0,r0,r4
+	ldr	r12,[r11,#240-16]
+	eor	r1,r1,r5
+	eor	r2,r2,r6
+	eor	r3,r3,r7
+	sub	r12,r12,#1
+	mov	lr,#255
+
+	and	r7,lr,r0
+	and	r8,lr,r0,lsr#8
+	and	r9,lr,r0,lsr#16
+	mov	r0,r0,lsr#24
+Lenc_loop:
+	ldr	r4,[r10,r7,lsl#2]	@ Te3[s0>>0]
+	and	r7,lr,r1,lsr#16	@ i0
+	ldr	r5,[r10,r8,lsl#2]	@ Te2[s0>>8]
+	and	r8,lr,r1
+	ldr	r6,[r10,r9,lsl#2]	@ Te1[s0>>16]
+	and	r9,lr,r1,lsr#8
+	ldr	r0,[r10,r0,lsl#2]	@ Te0[s0>>24]
+	mov	r1,r1,lsr#24
+
+	ldr	r7,[r10,r7,lsl#2]	@ Te1[s1>>16]
+	ldr	r8,[r10,r8,lsl#2]	@ Te3[s1>>0]
+	ldr	r9,[r10,r9,lsl#2]	@ Te2[s1>>8]
+	eor	r0,r0,r7,ror#8
+	ldr	r1,[r10,r1,lsl#2]	@ Te0[s1>>24]
+	and	r7,lr,r2,lsr#8	@ i0
+	eor	r5,r5,r8,ror#8
+	and	r8,lr,r2,lsr#16	@ i1
+	eor	r6,r6,r9,ror#8
+	and	r9,lr,r2
+	ldr	r7,[r10,r7,lsl#2]	@ Te2[s2>>8]
+	eor	r1,r1,r4,ror#24
+	ldr	r8,[r10,r8,lsl#2]	@ Te1[s2>>16]
+	mov	r2,r2,lsr#24
+
+	ldr	r9,[r10,r9,lsl#2]	@ Te3[s2>>0]
+	eor	r0,r0,r7,ror#16
+	ldr	r2,[r10,r2,lsl#2]	@ Te0[s2>>24]
+	and	r7,lr,r3		@ i0
+	eor	r1,r1,r8,ror#8
+	and	r8,lr,r3,lsr#8	@ i1
+	eor	r6,r6,r9,ror#16
+	and	r9,lr,r3,lsr#16	@ i2
+	ldr	r7,[r10,r7,lsl#2]	@ Te3[s3>>0]
+	eor	r2,r2,r5,ror#16
+	ldr	r8,[r10,r8,lsl#2]	@ Te2[s3>>8]
+	mov	r3,r3,lsr#24
+
+	ldr	r9,[r10,r9,lsl#2]	@ Te1[s3>>16]
+	eor	r0,r0,r7,ror#24
+	ldr	r7,[r11],#16
+	eor	r1,r1,r8,ror#16
+	ldr	r3,[r10,r3,lsl#2]	@ Te0[s3>>24]
+	eor	r2,r2,r9,ror#8
+	ldr	r4,[r11,#-12]
+	eor	r3,r3,r6,ror#8
+
+	ldr	r5,[r11,#-8]
+	eor	r0,r0,r7
+	ldr	r6,[r11,#-4]
+	and	r7,lr,r0
+	eor	r1,r1,r4
+	and	r8,lr,r0,lsr#8
+	eor	r2,r2,r5
+	and	r9,lr,r0,lsr#16
+	eor	r3,r3,r6
+	mov	r0,r0,lsr#24
+
+	subs	r12,r12,#1
+	bne	Lenc_loop
+
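+	@ The final round needs plain S-box bytes. Rather than move to
+	@ the separate Te4 byte table, bump the base by 2 and keep the
+	@ lsl#2 indexing: bytes 1 and 2 of every Te0 word both hold S[x]
+	@ (the two 0x01 MixColumns coefficients), so the ldrb loads below
+	@ fetch S[x] straight out of Te0 on either endianness.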
+	add	r10,r10,#2
+
+	ldrb	r4,[r10,r7,lsl#2]	@ Te4[s0>>0]
+	and	r7,lr,r1,lsr#16	@ i0
+	ldrb	r5,[r10,r8,lsl#2]	@ Te4[s0>>8]
+	and	r8,lr,r1
+	ldrb	r6,[r10,r9,lsl#2]	@ Te4[s0>>16]
+	and	r9,lr,r1,lsr#8
+	ldrb	r0,[r10,r0,lsl#2]	@ Te4[s0>>24]
+	mov	r1,r1,lsr#24
+
+	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s1>>16]
+	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s1>>0]
+	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s1>>8]
+	eor	r0,r7,r0,lsl#8
+	ldrb	r1,[r10,r1,lsl#2]	@ Te4[s1>>24]
+	and	r7,lr,r2,lsr#8	@ i0
+	eor	r5,r8,r5,lsl#8
+	and	r8,lr,r2,lsr#16	@ i1
+	eor	r6,r9,r6,lsl#8
+	and	r9,lr,r2
+	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s2>>8]
+	eor	r1,r4,r1,lsl#24
+	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s2>>16]
+	mov	r2,r2,lsr#24
+
+	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s2>>0]
+	eor	r0,r7,r0,lsl#8
+	ldrb	r2,[r10,r2,lsl#2]	@ Te4[s2>>24]
+	and	r7,lr,r3		@ i0
+	eor	r1,r1,r8,lsl#16
+	and	r8,lr,r3,lsr#8	@ i1
+	eor	r6,r9,r6,lsl#8
+	and	r9,lr,r3,lsr#16	@ i2
+	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s3>>0]
+	eor	r2,r5,r2,lsl#24
+	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s3>>8]
+	mov	r3,r3,lsr#24
+
+	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s3>>16]
+	eor	r0,r7,r0,lsl#8
+	ldr	r7,[r11,#0]
+	ldrb	r3,[r10,r3,lsl#2]	@ Te4[s3>>24]
+	eor	r1,r1,r8,lsl#8
+	ldr	r4,[r11,#4]
+	eor	r2,r2,r9,lsl#16
+	ldr	r5,[r11,#8]
+	eor	r3,r6,r3,lsl#24
+	ldr	r6,[r11,#12]
+
+	eor	r0,r0,r7
+	eor	r1,r1,r4
+	eor	r2,r2,r5
+	eor	r3,r3,r6
+
+	sub	r10,r10,#2
+	ldr	pc,[sp],#4		@ pop and return
+
+
+.globl	_asm_AES_set_encrypt_key
+.private_extern	_asm_AES_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func	_asm_AES_set_encrypt_key
+#endif
+.align	5
+_asm_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
+#ifndef	__thumb2__
+	sub	r3,pc,#8		@ _asm_AES_set_encrypt_key
+#else
+	adr	r3,.
+#endif
+	teq	r0,#0
+#ifdef	__thumb2__
+	itt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	moveq	r0,#-1
+	beq	Labrt
+	teq	r2,#0
+#ifdef	__thumb2__
+	itt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	moveq	r0,#-1
+	beq	Labrt
+
+	teq	r1,#128
+	beq	Lok
+	teq	r1,#192
+	beq	Lok
+	teq	r1,#256
+#ifdef	__thumb2__
+	itt	ne			@ Thumb2 thing, sanity check in ARM
+#endif
+	movne	r0,#-1
+	bne	Labrt
+
+Lok:	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	mov	r12,r0		@ inp
+	mov	lr,r1			@ bits
+	mov	r11,r2			@ key
+
+#ifdef	__APPLE__
+	adr	r10,AES_Te+1024				@ Te4
+#else
+	sub	r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
+#endif
+
+#if __ARM_ARCH__<7
+	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
+	ldrb	r4,[r12,#2]	@ manner...
+	ldrb	r5,[r12,#1]
+	ldrb	r6,[r12,#0]
+	orr	r0,r0,r4,lsl#8
+	ldrb	r1,[r12,#7]
+	orr	r0,r0,r5,lsl#16
+	ldrb	r4,[r12,#6]
+	orr	r0,r0,r6,lsl#24
+	ldrb	r5,[r12,#5]
+	ldrb	r6,[r12,#4]
+	orr	r1,r1,r4,lsl#8
+	ldrb	r2,[r12,#11]
+	orr	r1,r1,r5,lsl#16
+	ldrb	r4,[r12,#10]
+	orr	r1,r1,r6,lsl#24
+	ldrb	r5,[r12,#9]
+	ldrb	r6,[r12,#8]
+	orr	r2,r2,r4,lsl#8
+	ldrb	r3,[r12,#15]
+	orr	r2,r2,r5,lsl#16
+	ldrb	r4,[r12,#14]
+	orr	r2,r2,r6,lsl#24
+	ldrb	r5,[r12,#13]
+	ldrb	r6,[r12,#12]
+	orr	r3,r3,r4,lsl#8
+	str	r0,[r11],#16
+	orr	r3,r3,r5,lsl#16
+	str	r1,[r11,#-12]
+	orr	r3,r3,r6,lsl#24
+	str	r2,[r11,#-8]
+	str	r3,[r11,#-4]
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r11],#16
+	str	r1,[r11,#-12]
+	str	r2,[r11,#-8]
+	str	r3,[r11,#-4]
+#endif
+
+	teq	lr,#128
+	bne	Lnot128
+	mov	r12,#10
+	str	r12,[r11,#240-16]
+	add	r6,r10,#256			@ rcon
+	mov	lr,#255
+
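+	@ In C terms each pass of the loop computes
+	@	rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon[i];
+	@	rk[5] = rk[1] ^ rk[4];
+	@	rk[6] = rk[2] ^ rk[5];
+	@	rk[7] = rk[3] ^ rk[6];
+	@ with SubWord done as four ldrb lookups into Te4 and RotWord by
+	@ the byte positions chosen when r5 is reassembled.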
+L128_loop:
+	and	r5,lr,r3,lsr#24
+	and	r7,lr,r3,lsr#16
+	ldrb	r5,[r10,r5]
+	and	r8,lr,r3,lsr#8
+	ldrb	r7,[r10,r7]
+	and	r9,lr,r3
+	ldrb	r8,[r10,r8]
+	orr	r5,r5,r7,lsl#24
+	ldrb	r9,[r10,r9]
+	orr	r5,r5,r8,lsl#16
+	ldr	r4,[r6],#4			@ rcon[i++]
+	orr	r5,r5,r9,lsl#8
+	eor	r5,r5,r4
+	eor	r0,r0,r5			@ rk[4]=rk[0]^...
+	eor	r1,r1,r0			@ rk[5]=rk[1]^rk[4]
+	str	r0,[r11],#16
+	eor	r2,r2,r1			@ rk[6]=rk[2]^rk[5]
+	str	r1,[r11,#-12]
+	eor	r3,r3,r2			@ rk[7]=rk[3]^rk[6]
+	str	r2,[r11,#-8]
+	subs	r12,r12,#1
+	str	r3,[r11,#-4]
+	bne	L128_loop
+	sub	r2,r11,#176
+	b	Ldone
+
+Lnot128:
+#if __ARM_ARCH__<7
+	ldrb	r8,[r12,#19]
+	ldrb	r4,[r12,#18]
+	ldrb	r5,[r12,#17]
+	ldrb	r6,[r12,#16]
+	orr	r8,r8,r4,lsl#8
+	ldrb	r9,[r12,#23]
+	orr	r8,r8,r5,lsl#16
+	ldrb	r4,[r12,#22]
+	orr	r8,r8,r6,lsl#24
+	ldrb	r5,[r12,#21]
+	ldrb	r6,[r12,#20]
+	orr	r9,r9,r4,lsl#8
+	orr	r9,r9,r5,lsl#16
+	str	r8,[r11],#8
+	orr	r9,r9,r6,lsl#24
+	str	r9,[r11,#-4]
+#else
+	ldr	r8,[r12,#16]
+	ldr	r9,[r12,#20]
+#ifdef __ARMEL__
+	rev	r8,r8
+	rev	r9,r9
+#endif
+	str	r8,[r11],#8
+	str	r9,[r11,#-4]
+#endif
+
+	teq	lr,#192
+	bne	Lnot192
+	mov	r12,#12
+	str	r12,[r11,#240-24]
+	add	r6,r10,#256			@ rcon
+	mov	lr,#255
+	mov	r12,#8
+
+L192_loop:
+	and	r5,lr,r9,lsr#24
+	and	r7,lr,r9,lsr#16
+	ldrb	r5,[r10,r5]
+	and	r8,lr,r9,lsr#8
+	ldrb	r7,[r10,r7]
+	and	r9,lr,r9
+	ldrb	r8,[r10,r8]
+	orr	r5,r5,r7,lsl#24
+	ldrb	r9,[r10,r9]
+	orr	r5,r5,r8,lsl#16
+	ldr	r4,[r6],#4			@ rcon[i++]
+	orr	r5,r5,r9,lsl#8
+	eor	r9,r5,r4
+	eor	r0,r0,r9			@ rk[6]=rk[0]^...
+	eor	r1,r1,r0			@ rk[7]=rk[1]^rk[6]
+	str	r0,[r11],#24
+	eor	r2,r2,r1			@ rk[8]=rk[2]^rk[7]
+	str	r1,[r11,#-20]
+	eor	r3,r3,r2			@ rk[9]=rk[3]^rk[8]
+	str	r2,[r11,#-16]
+	subs	r12,r12,#1
+	str	r3,[r11,#-12]
+#ifdef	__thumb2__
+	itt	eq				@ Thumb2 thing, sanity check in ARM
+#endif
+	subeq	r2,r11,#216
+	beq	Ldone
+
+	ldr	r7,[r11,#-32]
+	ldr	r8,[r11,#-28]
+	eor	r7,r7,r3			@ rk[10]=rk[4]^rk[9]
+	eor	r9,r8,r7			@ rk[11]=rk[5]^rk[10]
+	str	r7,[r11,#-8]
+	str	r9,[r11,#-4]
+	b	L192_loop
+
+Lnot192:
+#if __ARM_ARCH__<7
+	ldrb	r8,[r12,#27]
+	ldrb	r4,[r12,#26]
+	ldrb	r5,[r12,#25]
+	ldrb	r6,[r12,#24]
+	orr	r8,r8,r4,lsl#8
+	ldrb	r9,[r12,#31]
+	orr	r8,r8,r5,lsl#16
+	ldrb	r4,[r12,#30]
+	orr	r8,r8,r6,lsl#24
+	ldrb	r5,[r12,#29]
+	ldrb	r6,[r12,#28]
+	orr	r9,r9,r4,lsl#8
+	orr	r9,r9,r5,lsl#16
+	str	r8,[r11],#8
+	orr	r9,r9,r6,lsl#24
+	str	r9,[r11,#-4]
+#else
+	ldr	r8,[r12,#24]
+	ldr	r9,[r12,#28]
+#ifdef __ARMEL__
+	rev	r8,r8
+	rev	r9,r9
+#endif
+	str	r8,[r11],#8
+	str	r9,[r11,#-4]
+#endif
+
+	mov	r12,#14
+	str	r12,[r11,#240-32]
+	add	r6,r10,#256			@ rcon
+	mov	lr,#255
+	mov	r12,#7
+
+L256_loop:
+	and	r5,lr,r9,lsr#24
+	and	r7,lr,r9,lsr#16
+	ldrb	r5,[r10,r5]
+	and	r8,lr,r9,lsr#8
+	ldrb	r7,[r10,r7]
+	and	r9,lr,r9
+	ldrb	r8,[r10,r8]
+	orr	r5,r5,r7,lsl#24
+	ldrb	r9,[r10,r9]
+	orr	r5,r5,r8,lsl#16
+	ldr	r4,[r6],#4			@ rcon[i++]
+	orr	r5,r5,r9,lsl#8
+	eor	r9,r5,r4
+	eor	r0,r0,r9			@ rk[8]=rk[0]^...
+	eor	r1,r1,r0			@ rk[9]=rk[1]^rk[8]
+	str	r0,[r11],#32
+	eor	r2,r2,r1			@ rk[10]=rk[2]^rk[9]
+	str	r1,[r11,#-28]
+	eor	r3,r3,r2			@ rk[11]=rk[3]^rk[10]
+	str	r2,[r11,#-24]
+	subs	r12,r12,#1
+	str	r3,[r11,#-20]
+#ifdef	__thumb2__
+	itt	eq				@ Thumb2 thing, sanity check in ARM
+#endif
+	subeq	r2,r11,#256
+	beq	Ldone
+
+	and	r5,lr,r3
+	and	r7,lr,r3,lsr#8
+	ldrb	r5,[r10,r5]
+	and	r8,lr,r3,lsr#16
+	ldrb	r7,[r10,r7]
+	and	r9,lr,r3,lsr#24
+	ldrb	r8,[r10,r8]
+	orr	r5,r5,r7,lsl#8
+	ldrb	r9,[r10,r9]
+	orr	r5,r5,r8,lsl#16
+	ldr	r4,[r11,#-48]
+	orr	r5,r5,r9,lsl#24
+
+	ldr	r7,[r11,#-44]
+	ldr	r8,[r11,#-40]
+	eor	r4,r4,r5			@ rk[12]=rk[4]^...
+	ldr	r9,[r11,#-36]
+	eor	r7,r7,r4			@ rk[13]=rk[5]^rk[12]
+	str	r4,[r11,#-16]
+	eor	r8,r8,r7			@ rk[14]=rk[6]^rk[13]
+	str	r7,[r11,#-12]
+	eor	r9,r9,r8			@ rk[15]=rk[7]^rk[14]
+	str	r8,[r11,#-8]
+	str	r9,[r11,#-4]
+	b	L256_loop
+
+.align	2
+Ldone:	mov	r0,#0
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+Labrt:
+#if __ARM_ARCH__>=5
+	bx	lr				@ .word	0xe12fff1e
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+.globl	_asm_AES_set_decrypt_key
+.private_extern	_asm_AES_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func	_asm_AES_set_decrypt_key
+#endif
+.align	5
+_asm_AES_set_decrypt_key:
+	str	lr,[sp,#-4]!            @ push lr
+	bl	_armv4_AES_set_encrypt_key
+	teq	r0,#0
+	ldr	lr,[sp],#4              @ pop lr
+	bne	Labrt
+
+	mov	r0,r2			@ _asm_AES_set_encrypt_key preserves r2,
+	mov	r1,r2			@ which is AES_KEY *key
+	b	_armv4_AES_set_enc2dec_key
+
+
+@ void AES_set_enc2dec_key(const AES_KEY *inp, AES_KEY *out)
+.globl	_AES_set_enc2dec_key
+.private_extern	_AES_set_enc2dec_key
+#ifdef __thumb2__
+.thumb_func	_AES_set_enc2dec_key
+#endif
+.align	5
+_AES_set_enc2dec_key:
+_armv4_AES_set_enc2dec_key:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+	ldr	r12,[r0,#240]
+	mov	r7,r0			@ input
+	add	r8,r0,r12,lsl#4
+	mov	r11,r1			@ output
+	add	r10,r1,r12,lsl#4
+	str	r12,[r1,#240]
+
+Linv:	ldr	r0,[r7],#16
+	ldr	r1,[r7,#-12]
+	ldr	r2,[r7,#-8]
+	ldr	r3,[r7,#-4]
+	ldr	r4,[r8],#-16
+	ldr	r5,[r8,#16+4]
+	ldr	r6,[r8,#16+8]
+	ldr	r9,[r8,#16+12]
+	str	r0,[r10],#-16
+	str	r1,[r10,#16+4]
+	str	r2,[r10,#16+8]
+	str	r3,[r10,#16+12]
+	str	r4,[r11],#16
+	str	r5,[r11,#-12]
+	str	r6,[r11,#-8]
+	str	r9,[r11,#-4]
+	teq	r7,r8
+	bne	Linv
+
+	ldr	r0,[r7]
+	ldr	r1,[r7,#4]
+	ldr	r2,[r7,#8]
+	ldr	r3,[r7,#12]
+	str	r0,[r11]
+	str	r1,[r11,#4]
+	str	r2,[r11,#8]
+	str	r3,[r11,#12]
+	sub	r11,r11,r12,lsl#3
+	ldr	r0,[r11,#16]!		@ prefetch tp1
+	mov	r7,#0x80
+	mov	r8,#0x1b
+	orr	r7,r7,#0x8000
+	orr	r8,r8,#0x1b00
+	orr	r7,r7,r7,lsl#16
+	orr	r8,r8,r8,lsl#16
+	sub	r12,r12,#1
+	mvn	r9,r7
+	mov	r12,r12,lsl#2	@ (rounds-1)*4
+
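+	@ Lmix runs InvMixColumns over one round-key word per iteration
+	@ (the equivalent-inverse-cipher transform). The mask dance with
+	@ r7=0x80808080, r8=0x1b1b1b1b, r9=0x7f7f7f7f is a branchless
+	@ xtime on four packed bytes, roughly, in C:
+	@
+	@	tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
+	@	      (((tp1 & 0x80808080) >> 7) * 0x1b);
+	@
+	@ and likewise tp4=xtime(tp2), tp8=xtime(tp4). The word is then
+	@ rebuilt as (tp2^tp4^tp8) ^ ROL(tp9^tp2,8) ^ ROL(tp9^tp4,16)
+	@ ^ ROL(tp9,24) with tp9=tp1^tp8, i.e. the 0e/0b/0d/09
+	@ InvMixColumns coefficients.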
+Lmix:	and	r4,r0,r7
+	and	r1,r0,r9
+	sub	r4,r4,r4,lsr#7
+	and	r4,r4,r8
+	eor	r1,r4,r1,lsl#1	@ tp2
+
+	and	r4,r1,r7
+	and	r2,r1,r9
+	sub	r4,r4,r4,lsr#7
+	and	r4,r4,r8
+	eor	r2,r4,r2,lsl#1	@ tp4
+
+	and	r4,r2,r7
+	and	r3,r2,r9
+	sub	r4,r4,r4,lsr#7
+	and	r4,r4,r8
+	eor	r3,r4,r3,lsl#1	@ tp8
+
+	eor	r4,r1,r2
+	eor	r5,r0,r3		@ tp9
+	eor	r4,r4,r3		@ tpe
+	eor	r4,r4,r1,ror#24
+	eor	r4,r4,r5,ror#24	@ ^= ROTATE(tpb=tp9^tp2,8)
+	eor	r4,r4,r2,ror#16
+	eor	r4,r4,r5,ror#16	@ ^= ROTATE(tpd=tp9^tp4,16)
+	eor	r4,r4,r5,ror#8	@ ^= ROTATE(tp9,24)
+
+	ldr	r0,[r11,#4]		@ prefetch tp1
+	str	r4,[r11],#4
+	subs	r12,r12,#1
+	bne	Lmix
+
+	mov	r0,#0
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+
+.align	5
+AES_Td:
+.word	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+.word	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+.word	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+.word	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+.word	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+.word	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+.word	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+.word	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+.word	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+.word	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+.word	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+.word	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+.word	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+.word	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+.word	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+.word	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+.word	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+.word	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+.word	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+.word	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+.word	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+.word	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+.word	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+.word	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+.word	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+.word	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+.word	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+.word	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+.word	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+.word	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+.word	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+.word	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+.word	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+.word	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+.word	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+.word	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+.word	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+.word	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+.word	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+.word	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+.word	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+.word	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+.word	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+.word	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+.word	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+.word	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+.word	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+.word	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+.word	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+.word	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+.word	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+.word	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+.word	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+.word	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+.word	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+.word	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+.word	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+.word	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+.word	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+.word	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+.word	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+.word	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+.word	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+.word	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+@ Td4[256]
+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+
+@ void asm_AES_decrypt(const unsigned char *in, unsigned char *out,
+@ 		       const AES_KEY *key) {
+.globl	_asm_AES_decrypt
+.private_extern	_asm_AES_decrypt
+#ifdef __thumb2__
+.thumb_func	_asm_AES_decrypt
+#endif
+.align	5
+_asm_AES_decrypt:
+#ifndef	__thumb2__
+	sub	r3,pc,#8		@ _asm_AES_decrypt
+#else
+	adr	r3,.
+#endif
+	stmdb	sp!,{r1,r4-r12,lr}
+#ifdef	__APPLE__
+	adr	r10,AES_Td
+#else
+	sub	r10,r3,#_asm_AES_decrypt-AES_Td	@ Td
+#endif
+	mov	r12,r0		@ inp
+	mov	r11,r2
+#if __ARM_ARCH__<7
+	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
+	ldrb	r4,[r12,#2]	@ manner...
+	ldrb	r5,[r12,#1]
+	ldrb	r6,[r12,#0]
+	orr	r0,r0,r4,lsl#8
+	ldrb	r1,[r12,#7]
+	orr	r0,r0,r5,lsl#16
+	ldrb	r4,[r12,#6]
+	orr	r0,r0,r6,lsl#24
+	ldrb	r5,[r12,#5]
+	ldrb	r6,[r12,#4]
+	orr	r1,r1,r4,lsl#8
+	ldrb	r2,[r12,#11]
+	orr	r1,r1,r5,lsl#16
+	ldrb	r4,[r12,#10]
+	orr	r1,r1,r6,lsl#24
+	ldrb	r5,[r12,#9]
+	ldrb	r6,[r12,#8]
+	orr	r2,r2,r4,lsl#8
+	ldrb	r3,[r12,#15]
+	orr	r2,r2,r5,lsl#16
+	ldrb	r4,[r12,#14]
+	orr	r2,r2,r6,lsl#24
+	ldrb	r5,[r12,#13]
+	ldrb	r6,[r12,#12]
+	orr	r3,r3,r4,lsl#8
+	orr	r3,r3,r5,lsl#16
+	orr	r3,r3,r6,lsl#24
+#else
+	ldr	r0,[r12,#0]
+	ldr	r1,[r12,#4]
+	ldr	r2,[r12,#8]
+	ldr	r3,[r12,#12]
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#endif
+	bl	_armv4_AES_decrypt
+
+	ldr	r12,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	str	r0,[r12,#0]
+	str	r1,[r12,#4]
+	str	r2,[r12,#8]
+	str	r3,[r12,#12]
+#else
+	mov	r4,r0,lsr#24		@ write output in endian-neutral
+	mov	r5,r0,lsr#16		@ manner...
+	mov	r6,r0,lsr#8
+	strb	r4,[r12,#0]
+	strb	r5,[r12,#1]
+	mov	r4,r1,lsr#24
+	strb	r6,[r12,#2]
+	mov	r5,r1,lsr#16
+	strb	r0,[r12,#3]
+	mov	r6,r1,lsr#8
+	strb	r4,[r12,#4]
+	strb	r5,[r12,#5]
+	mov	r4,r2,lsr#24
+	strb	r6,[r12,#6]
+	mov	r5,r2,lsr#16
+	strb	r1,[r12,#7]
+	mov	r6,r2,lsr#8
+	strb	r4,[r12,#8]
+	strb	r5,[r12,#9]
+	mov	r4,r3,lsr#24
+	strb	r6,[r12,#10]
+	mov	r5,r3,lsr#16
+	strb	r2,[r12,#11]
+	mov	r6,r3,lsr#8
+	strb	r4,[r12,#12]
+	strb	r5,[r12,#13]
+	strb	r6,[r12,#14]
+	strb	r3,[r12,#15]
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+#ifdef __thumb2__
+.thumb_func	_armv4_AES_decrypt
+#endif
+.align	2
+_armv4_AES_decrypt:
+	str	lr,[sp,#-4]!		@ push lr
+	ldmia	r11!,{r4,r5,r6,r7}
+	eor	r0,r0,r4
+	ldr	r12,[r11,#240-16]
+	eor	r1,r1,r5
+	eor	r2,r2,r6
+	eor	r3,r3,r7
+	sub	r12,r12,#1
+	mov	lr,#255
+
+	and	r7,lr,r0,lsr#16
+	and	r8,lr,r0,lsr#8
+	and	r9,lr,r0
+	mov	r0,r0,lsr#24
+Ldec_loop:
+	ldr	r4,[r10,r7,lsl#2]	@ Td1[s0>>16]
+	and	r7,lr,r1		@ i0
+	ldr	r5,[r10,r8,lsl#2]	@ Td2[s0>>8]
+	and	r8,lr,r1,lsr#16
+	ldr	r6,[r10,r9,lsl#2]	@ Td3[s0>>0]
+	and	r9,lr,r1,lsr#8
+	ldr	r0,[r10,r0,lsl#2]	@ Td0[s0>>24]
+	mov	r1,r1,lsr#24
+
+	ldr	r7,[r10,r7,lsl#2]	@ Td3[s1>>0]
+	ldr	r8,[r10,r8,lsl#2]	@ Td1[s1>>16]
+	ldr	r9,[r10,r9,lsl#2]	@ Td2[s1>>8]
+	eor	r0,r0,r7,ror#24
+	ldr	r1,[r10,r1,lsl#2]	@ Td0[s1>>24]
+	and	r7,lr,r2,lsr#8	@ i0
+	eor	r5,r8,r5,ror#8
+	and	r8,lr,r2		@ i1
+	eor	r6,r9,r6,ror#8
+	and	r9,lr,r2,lsr#16
+	ldr	r7,[r10,r7,lsl#2]	@ Td2[s2>>8]
+	eor	r1,r1,r4,ror#8
+	ldr	r8,[r10,r8,lsl#2]	@ Td3[s2>>0]
+	mov	r2,r2,lsr#24
+
+	ldr	r9,[r10,r9,lsl#2]	@ Td1[s2>>16]
+	eor	r0,r0,r7,ror#16
+	ldr	r2,[r10,r2,lsl#2]	@ Td0[s2>>24]
+	and	r7,lr,r3,lsr#16	@ i0
+	eor	r1,r1,r8,ror#24
+	and	r8,lr,r3,lsr#8	@ i1
+	eor	r6,r9,r6,ror#8
+	and	r9,lr,r3		@ i2
+	ldr	r7,[r10,r7,lsl#2]	@ Td1[s3>>16]
+	eor	r2,r2,r5,ror#8
+	ldr	r8,[r10,r8,lsl#2]	@ Td2[s3>>8]
+	mov	r3,r3,lsr#24
+
+	ldr	r9,[r10,r9,lsl#2]	@ Td3[s3>>0]
+	eor	r0,r0,r7,ror#8
+	ldr	r7,[r11],#16
+	eor	r1,r1,r8,ror#16
+	ldr	r3,[r10,r3,lsl#2]	@ Td0[s3>>24]
+	eor	r2,r2,r9,ror#24
+
+	ldr	r4,[r11,#-12]
+	eor	r0,r0,r7
+	ldr	r5,[r11,#-8]
+	eor	r3,r3,r6,ror#8
+	ldr	r6,[r11,#-4]
+	and	r7,lr,r0,lsr#16
+	eor	r1,r1,r4
+	and	r8,lr,r0,lsr#8
+	eor	r2,r2,r5
+	and	r9,lr,r0
+	eor	r3,r3,r6
+	mov	r0,r0,lsr#24
+
+	subs	r12,r12,#1
+	bne	Ldec_loop
+
+	add	r10,r10,#1024
+
+	ldr	r5,[r10,#0]		@ prefetch Td4
+	ldr	r6,[r10,#32]
+	ldr	r4,[r10,#64]
+	ldr	r5,[r10,#96]
+	ldr	r6,[r10,#128]
+	ldr	r4,[r10,#160]
+	ldr	r5,[r10,#192]
+	ldr	r6,[r10,#224]
+
+	ldrb	r0,[r10,r0]		@ Td4[s0>>24]
+	ldrb	r4,[r10,r7]		@ Td4[s0>>16]
+	and	r7,lr,r1		@ i0
+	ldrb	r5,[r10,r8]		@ Td4[s0>>8]
+	and	r8,lr,r1,lsr#16
+	ldrb	r6,[r10,r9]		@ Td4[s0>>0]
+	and	r9,lr,r1,lsr#8
+
+	add	r1,r10,r1,lsr#24
+	ldrb	r7,[r10,r7]		@ Td4[s1>>0]
+	ldrb	r1,[r1]		@ Td4[s1>>24]
+	ldrb	r8,[r10,r8]		@ Td4[s1>>16]
+	eor	r0,r7,r0,lsl#24
+	ldrb	r9,[r10,r9]		@ Td4[s1>>8]
+	eor	r1,r4,r1,lsl#8
+	and	r7,lr,r2,lsr#8	@ i0
+	eor	r5,r5,r8,lsl#8
+	and	r8,lr,r2		@ i1
+	ldrb	r7,[r10,r7]		@ Td4[s2>>8]
+	eor	r6,r6,r9,lsl#8
+	ldrb	r8,[r10,r8]		@ Td4[s2>>0]
+	and	r9,lr,r2,lsr#16
+
+	add	r2,r10,r2,lsr#24
+	ldrb	r2,[r2]		@ Td4[s2>>24]
+	eor	r0,r0,r7,lsl#8
+	ldrb	r9,[r10,r9]		@ Td4[s2>>16]
+	eor	r1,r8,r1,lsl#16
+	and	r7,lr,r3,lsr#16	@ i0
+	eor	r2,r5,r2,lsl#16
+	and	r8,lr,r3,lsr#8	@ i1
+	ldrb	r7,[r10,r7]		@ Td4[s3>>16]
+	eor	r6,r6,r9,lsl#16
+	ldrb	r8,[r10,r8]		@ Td4[s3>>8]
+	and	r9,lr,r3		@ i2
+
+	add	r3,r10,r3,lsr#24
+	ldrb	r9,[r10,r9]		@ Td4[s3>>0]
+	ldrb	r3,[r3]		@ Td4[s3>>24]
+	eor	r0,r0,r7,lsl#16
+	ldr	r7,[r11,#0]
+	eor	r1,r1,r8,lsl#8
+	ldr	r4,[r11,#4]
+	eor	r2,r9,r2,lsl#8
+	ldr	r5,[r11,#8]
+	eor	r3,r6,r3,lsl#24
+	ldr	r6,[r11,#12]
+
+	eor	r0,r0,r7
+	eor	r1,r1,r4
+	eor	r2,r2,r5
+	eor	r3,r3,r6
+
+	sub	r10,r10,#1024
+	ldr	pc,[sp],#4		@ pop and return
+
+.byte	65,69,83,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
diff --git a/ios-arm/crypto/fipsmodule/aesv8-armx32.S b/ios-arm/crypto/fipsmodule/aesv8-armx32.S
new file mode 100644
index 0000000..d44c88c
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/aesv8-armx32.S
@@ -0,0 +1,773 @@
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+
+.code	32
+#undef	__thumb2__
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
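+
+@ 0x0c0f0e0d is a vtbl index that "rotate-n-splats": it broadcasts the
+@ last key word to all four lanes with RotWord applied. The 0x01 and
+@ 0x1b words seed the round constant, doubled each round by
+@ "vshl.u8 q1,q1,#1". SubWord falls out of "aese qN,q0" against the
+@ all-zero q0: with identical words in every lane ShiftRows has no
+@ effect, leaving per-byte SubBytes. The AES instructions themselves
+@ are spelled as .byte sequences, presumably so that assemblers
+@ without crypto-extension support can still build the file.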
+
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func	_aes_hw_set_encrypt_key
+#endif
+.align	5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+	mov	r3,#-1
+	cmp	r0,#0
+	beq	Lenc_key_abort
+	cmp	r2,#0
+	beq	Lenc_key_abort
+	mov	r3,#-2
+	cmp	r1,#128
+	blt	Lenc_key_abort
+	cmp	r1,#256
+	bgt	Lenc_key_abort
+	tst	r1,#0x3f
+	bne	Lenc_key_abort
+
+	adr	r3,Lrcon
+	cmp	r1,#192
+
+	veor	q0,q0,q0
+	vld1.8	{q3},[r0]!
+	mov	r1,#8		@ reuse r1 as loop counter
+	vld1.32	{q1,q2},[r3]!
+
+	blt	Loop128
+	beq	L192
+	b	L256
+
+.align	4
+Loop128:
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	bne	Loop128
+
+	vld1.32	{q1},[r3]
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]
+	add	r2,r2,#0x50
+
+	mov	r12,#10
+	b	Ldone
+
+.align	4
+L192:
+	vld1.8	{d16},[r0]!
+	vmov.i8	q10,#8			@ borrow q10
+	vst1.32	{q3},[r2]!
+	vsub.i8	q2,q2,q10	@ adjust the mask
+
+Loop192:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{d16},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+
+	vdup.32	q9,d7[1]
+	veor	q9,q9,q8
+	veor	q10,q10,q1
+	vext.8	q8,q0,q8,#12
+	vshl.u8	q1,q1,#1
+	veor	q8,q8,q9
+	veor	q3,q3,q10
+	veor	q8,q8,q10
+	vst1.32	{q3},[r2]!
+	bne	Loop192
+
+	mov	r12,#12
+	add	r2,r2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	vld1.8	{q8},[r0]
+	mov	r1,#7
+	mov	r12,#14
+	vst1.32	{q3},[r2]!
+
+Loop256:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q8},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]!
+	beq	Ldone
+
+	vdup.32	q10,d7[1]
+	vext.8	q9,q0,q8,#12
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+
+	veor	q8,q8,q10
+	b	Loop256
+
+Ldone:
+	str	r12,[r2]
+	mov	r3,#0
+
+Lenc_key_abort:
+	mov	r0,r3			@ return value
+
+	bx	lr
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func	_aes_hw_set_decrypt_key
+#endif
+.align	5
+_aes_hw_set_decrypt_key:
+	stmdb	sp!,{r4,lr}
+	bl	Lenc_key
+
+	cmp	r0,#0
+	bne	Ldec_key_abort
+
+	sub	r2,r2,#240		@ restore original r2
+	mov	r4,#-16
+	add	r0,r2,r12,lsl#4	@ end of key schedule
+
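+	@ Derive the decryption schedule for the equivalent inverse
+	@ cipher: walk the encryption schedule from both ends, swapping
+	@ round keys, and run aesimc (InvMixColumns) over every key
+	@ except the outermost pair exchanged just below.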
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+
+Loop_imc:
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+	cmp	r0,r2
+	bhi	Loop_imc
+
+	vld1.32	{q0},[r2]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+	vst1.32	{q0},[r0]
+
+	eor	r0,r0,r0		@ return value
+Ldec_key_abort:
+	ldmia	sp!,{r4,pc}
+
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_encrypt
+#endif
+.align	5
+_aes_hw_encrypt:
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+Loop_enc:
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	Loop_enc
+
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_decrypt
+#endif
+.align	5
+_aes_hw_decrypt:
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+Loop_dec:
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	Loop_dec
+
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_cbc_encrypt
+#endif
+.align	5
+_aes_hw_cbc_encrypt:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load remaining args
+	subs	r2,r2,#16
+	mov	r8,#16
+	blo	Lcbc_abort
+	moveq	r8,#0
+
+	cmp	r5,#0			@ en- or decrypting?
+	ldr	r5,[r3,#240]
+	and	r2,r2,#-16
+	vld1.8	{q6},[r4]
+	vld1.8	{q0},[r0],r8
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#6
+	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
+	sub	r5,r5,#2
+	vld1.32	{q10,q11},[r7]!
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+
+	add	r7,r3,#32
+	mov	r6,r5
+	beq	Lcbc_dec
+
+	cmp	r5,#2
+	veor	q0,q0,q6
+	veor	q5,q8,q7
+	beq	Lcbc_enc128
+
+	vld1.32	{q2,q3},[r7]
+	add	r7,r3,#16
+	add	r6,r3,#16*4
+	add	r12,r3,#16*5
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	add	r14,r3,#16*6
+	add	r3,r3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+Lenter_cbc_enc:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r6]
+	cmp	r5,#4
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r12]
+	beq	Lcbc_enc192
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r14]
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r3]
+	nop
+
+Lcbc_enc192:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	Loop_cbc_enc
+
+	vst1.8	{q6},[r1]!
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	vld1.32	{q2,q3},[r7]
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+Lenter_cbc_enc128:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	Loop_cbc_enc128
+
+	vst1.8	{q6},[r1]!
+	b	Lcbc_done
+.align	5
+Lcbc_dec:
+	vld1.8	{q10},[r0]!
+	subs	r2,r2,#32		@ bias
+	add	r6,r5,#2
+	vorr	q3,q0,q0
+	vorr	q1,q0,q0
+	vorr	q11,q10,q10
+	blo	Lcbc_dec_tail
+
+	vorr	q1,q10,q10
+	vld1.8	{q10},[r0]!
+	vorr	q2,q0,q0
+	vorr	q3,q1,q1
+	vorr	q11,q10,q10
+
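+	@ CBC decryption parallelizes (each plaintext block depends only
+	@ on ciphertext already in hand), so the main loop keeps three
+	@ blocks in flight through the interleaved aesd/aesimc rounds.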
+Loop3x_cbc_dec:
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Loop3x_cbc_dec
+
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q4,q6,q7
+	subs	r2,r2,#0x30
+	veor	q5,q2,q7
+	movlo	r6,r2			@ r6 is zero at this point
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+	add	r0,r0,r6		@ r0 is adjusted in such a way that
+					@ at exit from the loop q1-q10
+					@ are loaded with the last "words"
+	vorr	q6,q11,q11
+	mov	r7,r3
+.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q2},[r0]!
+.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q3},[r0]!
+.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q11},[r0]!
+.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	add	r6,r5,#2
+	veor	q4,q4,q0
+	veor	q5,q5,q1
+	veor	q10,q10,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q4},[r1]!
+	vorr	q0,q2,q2
+	vst1.8	{q5},[r1]!
+	vorr	q1,q3,q3
+	vst1.8	{q10},[r1]!
+	vorr	q10,q11,q11
+	bhs	Loop3x_cbc_dec
+
+	cmn	r2,#0x30
+	beq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Lcbc_dec_tail
+
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	cmn	r2,#0x20
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q5,q6,q7
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	beq	Lcbc_dec_one
+	veor	q5,q5,q1
+	veor	q9,q9,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+	vst1.8	{q9},[r1]!
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	veor	q5,q5,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+
+Lcbc_done:
+	vst1.8	{q6},[r4]
+Lcbc_abort:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func	_aes_hw_ctr32_encrypt_blocks
+#endif
+.align	5
+_aes_hw_ctr32_encrypt_blocks:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldr	r4, [ip]		@ load remaining arg
+	ldr	r5,[r3,#240]
+
+	ldr	r8, [r4, #12]
+	vld1.32	{q0},[r4]
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#4
+	mov	r12,#16
+	cmp	r2,#2
+	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
+	sub	r5,r5,#2
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+	add	r7,r3,#32
+	mov	r6,r5
+	movlo	r12,#0
+#ifndef __ARMEB__
+	rev	r8, r8
+#endif
+	vorr	q1,q0,q0
+	add	r10, r8, #1
+	vorr	q10,q0,q0
+	add	r8, r8, #2
+	vorr	q6,q0,q0
+	rev	r10, r10
+	vmov.32	d3[1],r10
+	bls	Lctr32_tail
+	rev	r12, r8
+	sub	r2,r2,#3		@ bias
+	vmov.32	d21[1],r12
+	b	Loop3x_ctr32
+
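+	@ CTR is likewise parallel: three counter blocks are kept in
+	@ flight, with the 32-bit big-endian block counter maintained in
+	@ r8 and spliced into lanes d1[1]/d3[1]/d21[1] as each trio
+	@ starts.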
+.align	4
+Loop3x_ctr32:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Loop3x_ctr32
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
+	vld1.8	{q2},[r0]!
+	vorr	q0,q6,q6
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.8	{q3},[r0]!
+	vorr	q1,q6,q6
+.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vld1.8	{q11},[r0]!
+	mov	r7,r3
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
+	vorr	q10,q6,q6
+	add	r9,r8,#1
+.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	veor	q2,q2,q7
+	add	r10,r8,#2
+.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	veor	q3,q3,q7
+	add	r8,r8,#3
+.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	veor	q11,q11,q7
+	rev	r9,r9
+.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vmov.32	d1[1], r9
+	rev	r10,r10
+.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vmov.32	d3[1], r10
+	rev	r12,r8
+.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vmov.32	d21[1], r12
+	subs	r2,r2,#3
+.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
+.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
+.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
+
+	veor	q2,q2,q4
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	vst1.8	{q2},[r1]!
+	veor	q3,q3,q5
+	mov	r6,r5
+	vst1.8	{q3},[r1]!
+	veor	q11,q11,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q11},[r1]!
+	bhs	Loop3x_ctr32
+
+	adds	r2,r2,#3
+	beq	Lctr32_done
+	cmp	r2,#1
+	mov	r12,#16
+	moveq	r12,#0
+
+Lctr32_tail:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q9},[r7]!
+	bgt	Lctr32_tail
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q2},[r0],r12
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q3},[r0]
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q2,q2,q7
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q3,q3,q7
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
+
+	cmp	r2,#1
+	veor	q2,q2,q0
+	veor	q3,q3,q1
+	vst1.8	{q2},[r1]!
+	beq	Lctr32_done
+	vst1.8	{q3},[r1]
+
+Lctr32_done:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+
+#endif
diff --git a/ios-arm/crypto/fipsmodule/armv4-mont.S b/ios-arm/crypto/fipsmodule/armv4-mont.S
new file mode 100644
index 0000000..fbb341f
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/armv4-mont.S
@@ -0,0 +1,966 @@
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align	5
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-Lbn_mul_mont
+#endif
+
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+#ifdef __thumb2__
+.thumb_func	_bn_mul_mont
+#endif
+
+.align	5
+_bn_mul_mont:
+Lbn_mul_mont:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+	tst	ip,#7
+	bne	Lialu
+	adr	r0,Lbn_mul_mont
+	ldr	r2,LOPENSSL_armcap
+	ldr	r0,[r0,r2]
+#ifdef	__APPLE__
+	ldr	r0,[r0]
+#endif
+	tst	r0,#ARMV7_NEON		@ NEON available?
+	ldmia	sp, {r0,r2}
+	beq	Lialu
+	add	sp,sp,#8
+	b	bn_mul8x_mont_neon
+.align	4
+Lialu:
+#endif
+	cmp	ip,#2
+	mov	r0,ip			@ load num
+#ifdef	__thumb2__
+	ittt	lt
+#endif
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	Labrt
+
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
+
+	mov	r0,r0,lsl#2		@ rescale r0 for byte count
+	sub	sp,sp,r0		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra word
+	sub	r0,r0,#4		@ "num=num-1"
+	add	r4,r2,r0		@ &bp[num-1]
+
+	add	r0,sp,r0		@ r0 to point at &tp[num-1]
+	ldr	r8,[r0,#14*4]		@ &n0
+	ldr	r2,[r2]		@ bp[0]
+	ldr	r5,[r1],#4		@ ap[0],ap++
+	ldr	r6,[r3],#4		@ np[0],np++
+	ldr	r8,[r8]		@ *n0
+	str	r4,[r0,#15*4]		@ save &bp[num]
+
+	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
+	str	r8,[r0,#14*4]		@ save n0 value
+	mul	r8,r10,r8		@ "tp[0]"*n0
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
+	mov	r4,sp
+
+L1st:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	mov	r10,r11
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	L1st
+
+	adds	r12,r12,r11
+	ldr	r4,[r0,#13*4]		@ restore bp
+	mov	r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	mov	r7,sp
+	str	r14,[r0,#4]		@ tp[num]=
+
+Louter:
+	sub	r7,r0,r7		@ "original" r0-1 value
+	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
+	ldr	r2,[r4,#4]!		@ *(++bp)
+	sub	r3,r3,r7		@ "rewind" np to &np[1]
+	ldr	r5,[r1,#-4]		@ ap[0]
+	ldr	r10,[sp]		@ tp[0]
+	ldr	r6,[r3,#-4]		@ np[0]
+	ldr	r7,[sp,#4]		@ tp[1]
+
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
+	str	r4,[r0,#13*4]		@ save bp
+	mul	r8,r10,r8
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
+	mov	r4,sp
+
+Linner:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	adds	r10,r11,r7		@ +=tp[j]
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adc	r11,r11,#0
+	ldr	r7,[r4,#8]		@ tp[j+1]
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	Linner
+
+	adds	r12,r12,r11
+	mov	r14,#0
+	ldr	r4,[r0,#13*4]		@ restore bp
+	adc	r14,r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adds	r12,r12,r7
+	ldr	r7,[r0,#15*4]		@ restore &bp[num]
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	str	r14,[r0,#4]		@ tp[num]=
+
+	cmp	r4,r7
+#ifdef	__thumb2__
+	itt	ne
+#endif
+	movne	r7,sp
+	bne	Louter
+
+	ldr	r2,[r0,#12*4]		@ pull rp
+	mov	r5,sp
+	add	r0,r0,#4		@ r0 to point at &tp[num]
+	sub	r5,r0,r5		@ "original" num value
+	mov	r4,sp			@ "rewind" r4
+	mov	r1,r4			@ "borrow" r1
+	sub	r3,r3,r5		@ "rewind" r3 to &np[0]
+
+	subs	r7,r7,r7		@ "clear" carry flag
+Lsub:	ldr	r7,[r4],#4
+	ldr	r6,[r3],#4
+	sbcs	r7,r7,r6		@ tp[j]-np[j]
+	str	r7,[r2],#4		@ rp[j]=
+	teq	r4,r0		@ preserve carry
+	bne	Lsub
+	sbcs	r14,r14,#0		@ upmost carry
+	mov	r4,sp			@ "rewind" r4
+	sub	r2,r2,r5		@ "rewind" r2
+
+	and	r1,r4,r14
+	bic	r3,r2,r14
+	orr	r1,r1,r3		@ ap=borrow?tp:rp
+
+Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
+	str	sp,[r4],#4		@ zap tp
+	str	r7,[r2],#4
+	cmp	r4,r0
+	bne	Lcopy
+
+	mov	sp,r0
+	add	sp,sp,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+Labrt:
+#if __ARM_ARCH__>=5
+	bx	lr				@ bx lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func	bn_mul8x_mont_neon
+#endif
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load rest of parameter block
+	mov	ip,sp
+
+	cmp	r5,#8
+	bhi	LNEON_8n
+
+	@ special case for r5==8, everything is in register bank...
+
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	sub	r7,sp,r5,lsl#4
+	vld1.32	{d0,d1,d2,d3},  [r1]!		@ can't specify :32 :-(
+	and	r7,r7,#-64
+	vld1.32	{d30[0]}, [r4,:32]
+	mov	sp,r7			@ alloca
+	vzip.16	d28,d8
+
+	vmull.u32	q6,d28,d0[0]
+	vmull.u32	q7,d28,d0[1]
+	vmull.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmull.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	vmul.u32	d29,d29,d30
+
+	vmull.u32	q10,d28,d2[0]
+	vld1.32	{d4,d5,d6,d7}, [r3]!
+	vmull.u32	q11,d28,d2[1]
+	vmull.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmull.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	sub	r9,r5,#1
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	b	LNEON_outer8
+
+.align	4
+LNEON_outer8:
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	vadd.u64	d12,d12,d10
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	subs	r9,r9,#1
+	vmul.u32	d29,d29,d30
+
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	bne	LNEON_outer8
+
+	vadd.u64	d12,d12,d10
+	mov	r7,sp
+	vshr.u64	d10,d12,#16
+	mov	r8,r5
+	vadd.u64	d13,d13,d10
+	add	r6,sp,#96
+	vshr.u64	d10,d13,#16
+	vzip.16	d12,d13
+
+	b	LNEON_tail_entry
+
+.align	4
+LNEON_8n:
+	veor	q6,q6,q6
+	sub	r7,sp,#128
+	veor	q7,q7,q7
+	sub	r7,r7,r5,lsl#4
+	veor	q8,q8,q8
+	and	r7,r7,#-64
+	veor	q9,q9,q9
+	mov	sp,r7			@ alloca
+	veor	q10,q10,q10
+	add	r7,r7,#256
+	veor	q11,q11,q11
+	sub	r8,r5,#8
+	veor	q12,q12,q12
+	veor	q13,q13,q13
+
+LNEON_8n_init:
+	vst1.64	{q6,q7},[r7,:256]!
+	subs	r8,r8,#8
+	vst1.64	{q8,q9},[r7,:256]!
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12,q13},[r7,:256]!
+	bne	LNEON_8n_init
+
+	add	r6,sp,#256
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	add	r10,sp,#8
+	vld1.32	{d30[0]},[r4,:32]
+	mov	r9,r5
+	b	LNEON_8n_outer
+
+.align	4
+LNEON_8n_outer:
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	add	r7,sp,#128
+	vld1.32	{d4,d5,d6,d7},[r3]!
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+	vadd.u64	d29,d29,d12
+	vmlal.u32	q10,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q11,d28,d2[1]
+	vst1.32	{d28},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q6,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q7,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q8,d29,d5[0]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vadd.u64	d12,d12,d13
+	vmlal.u32	q11,d29,d6[1]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vadd.u64	d14,d14,d12
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+0]
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]!
+	vmlal.u32	q8,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q9,d28,d1[0]
+	vshl.i64	d29,d15,#16
+	vmlal.u32	q10,d28,d1[1]
+	vadd.u64	d29,d29,d14
+	vmlal.u32	q11,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q12,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+1]
+	vmlal.u32	q13,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q7,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q8,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q9,d29,d5[0]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vadd.u64	d14,d14,d15
+	vmlal.u32	q12,d29,d6[1]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vadd.u64	d16,d16,d14
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+1]
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]!
+	vmlal.u32	q9,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q10,d28,d1[0]
+	vshl.i64	d29,d17,#16
+	vmlal.u32	q11,d28,d1[1]
+	vadd.u64	d29,d29,d16
+	vmlal.u32	q12,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q13,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+2]
+	vmlal.u32	q6,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q8,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q9,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q10,d29,d5[0]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vadd.u64	d16,d16,d17
+	vmlal.u32	q13,d29,d6[1]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vadd.u64	d18,d18,d16
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+2]
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]!
+	vmlal.u32	q10,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q11,d28,d1[0]
+	vshl.i64	d29,d19,#16
+	vmlal.u32	q12,d28,d1[1]
+	vadd.u64	d29,d29,d18
+	vmlal.u32	q13,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q6,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+3]
+	vmlal.u32	q7,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q9,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q10,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q11,d29,d5[0]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vadd.u64	d18,d18,d19
+	vmlal.u32	q6,d29,d6[1]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vadd.u64	d20,d20,d18
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+3]
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]!
+	vmlal.u32	q11,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q12,d28,d1[0]
+	vshl.i64	d29,d21,#16
+	vmlal.u32	q13,d28,d1[1]
+	vadd.u64	d29,d29,d20
+	vmlal.u32	q6,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q7,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+4]
+	vmlal.u32	q8,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q10,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q11,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q12,d29,d5[0]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vadd.u64	d20,d20,d21
+	vmlal.u32	q7,d29,d6[1]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vadd.u64	d22,d22,d20
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+4]
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]!
+	vmlal.u32	q12,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q13,d28,d1[0]
+	vshl.i64	d29,d23,#16
+	vmlal.u32	q6,d28,d1[1]
+	vadd.u64	d29,d29,d22
+	vmlal.u32	q7,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q8,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+5]
+	vmlal.u32	q9,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q11,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q12,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q13,d29,d5[0]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vadd.u64	d22,d22,d23
+	vmlal.u32	q8,d29,d6[1]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vadd.u64	d24,d24,d22
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+5]
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]!
+	vmlal.u32	q13,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q6,d28,d1[0]
+	vshl.i64	d29,d25,#16
+	vmlal.u32	q7,d28,d1[1]
+	vadd.u64	d29,d29,d24
+	vmlal.u32	q8,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q9,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+6]
+	vmlal.u32	q10,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q12,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q13,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q6,d29,d5[0]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vadd.u64	d24,d24,d25
+	vmlal.u32	q9,d29,d6[1]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vadd.u64	d26,d26,d24
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+6]
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]!
+	vmlal.u32	q6,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q7,d28,d1[0]
+	vshl.i64	d29,d27,#16
+	vmlal.u32	q8,d28,d1[1]
+	vadd.u64	d29,d29,d26
+	vmlal.u32	q9,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q10,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+7]
+	vmlal.u32	q11,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q12,d28,d3[1]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q6,d29,d4[1]
+	vmlal.u32	q7,d29,d5[0]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vadd.u64	d26,d26,d27
+	vmlal.u32	q10,d29,d6[1]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q11,d29,d7[0]
+	vmlal.u32	q12,d29,d7[1]
+	vadd.u64	d12,d12,d26
+	vst1.32	{d29},[r10,:64]	@ put aside smashed m[8*i+7]
+	add	r10,sp,#8		@ rewind
+	sub	r8,r5,#8
+	b	LNEON_8n_inner
+
+.align	4
+LNEON_8n_inner:
+	subs	r8,r8,#8
+	vmlal.u32	q6,d28,d0[0]
+	vld1.64	{q13},[r6,:128]
+	vmlal.u32	q7,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	q8,d28,d1[0]
+	vld1.32	{d4,d5,d6,d7},[r3]!
+	vmlal.u32	q9,d28,d1[1]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+1]
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vmlal.u32	q11,d29,d6[1]
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vst1.64	{q6},[r7,:128]!
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]
+	vmlal.u32	q8,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+1]
+	vmlal.u32	q9,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d1[1]
+	vmlal.u32	q11,d28,d2[0]
+	vmlal.u32	q12,d28,d2[1]
+	vmlal.u32	q13,d28,d3[0]
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+2]
+	vmlal.u32	q7,d29,d4[0]
+	vmlal.u32	q8,d29,d4[1]
+	vmlal.u32	q9,d29,d5[0]
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vmlal.u32	q12,d29,d6[1]
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vst1.64	{q7},[r7,:128]!
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]
+	vmlal.u32	q9,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+2]
+	vmlal.u32	q10,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q11,d28,d1[1]
+	vmlal.u32	q12,d28,d2[0]
+	vmlal.u32	q13,d28,d2[1]
+	vmlal.u32	q6,d28,d3[0]
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+3]
+	vmlal.u32	q8,d29,d4[0]
+	vmlal.u32	q9,d29,d4[1]
+	vmlal.u32	q10,d29,d5[0]
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vmlal.u32	q13,d29,d6[1]
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vst1.64	{q8},[r7,:128]!
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]
+	vmlal.u32	q10,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+3]
+	vmlal.u32	q11,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q12,d28,d1[1]
+	vmlal.u32	q13,d28,d2[0]
+	vmlal.u32	q6,d28,d2[1]
+	vmlal.u32	q7,d28,d3[0]
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+4]
+	vmlal.u32	q9,d29,d4[0]
+	vmlal.u32	q10,d29,d4[1]
+	vmlal.u32	q11,d29,d5[0]
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vmlal.u32	q6,d29,d6[1]
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vst1.64	{q9},[r7,:128]!
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]
+	vmlal.u32	q11,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+4]
+	vmlal.u32	q12,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q13,d28,d1[1]
+	vmlal.u32	q6,d28,d2[0]
+	vmlal.u32	q7,d28,d2[1]
+	vmlal.u32	q8,d28,d3[0]
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+5]
+	vmlal.u32	q10,d29,d4[0]
+	vmlal.u32	q11,d29,d4[1]
+	vmlal.u32	q12,d29,d5[0]
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vmlal.u32	q7,d29,d6[1]
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vst1.64	{q10},[r7,:128]!
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]
+	vmlal.u32	q12,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+5]
+	vmlal.u32	q13,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q6,d28,d1[1]
+	vmlal.u32	q7,d28,d2[0]
+	vmlal.u32	q8,d28,d2[1]
+	vmlal.u32	q9,d28,d3[0]
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+6]
+	vmlal.u32	q11,d29,d4[0]
+	vmlal.u32	q12,d29,d4[1]
+	vmlal.u32	q13,d29,d5[0]
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vmlal.u32	q8,d29,d6[1]
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vst1.64	{q11},[r7,:128]!
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]
+	vmlal.u32	q13,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+6]
+	vmlal.u32	q6,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q7,d28,d1[1]
+	vmlal.u32	q8,d28,d2[0]
+	vmlal.u32	q9,d28,d2[1]
+	vmlal.u32	q10,d28,d3[0]
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+7]
+	vmlal.u32	q12,d29,d4[0]
+	vmlal.u32	q13,d29,d4[1]
+	vmlal.u32	q6,d29,d5[0]
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vmlal.u32	q9,d29,d6[1]
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vst1.64	{q12},[r7,:128]!
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]
+	vmlal.u32	q6,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+7]
+	vmlal.u32	q7,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q8,d28,d1[1]
+	vmlal.u32	q9,d28,d2[0]
+	vmlal.u32	q10,d28,d2[1]
+	vmlal.u32	q11,d28,d3[0]
+	vmlal.u32	q12,d28,d3[1]
+	it	eq
+	subeq	r1,r1,r5,lsl#2	@ rewind
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q6,d29,d4[1]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q7,d29,d5[0]
+	add	r10,sp,#8		@ rewind
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vmlal.u32	q10,d29,d6[1]
+	vmlal.u32	q11,d29,d7[0]
+	vst1.64	{q13},[r7,:128]!
+	vmlal.u32	q12,d29,d7[1]
+
+	bne	LNEON_8n_inner
+	add	r6,sp,#128
+	vst1.64	{q6,q7},[r7,:256]!
+	veor	q2,q2,q2		@ d4-d5
+	vst1.64	{q8,q9},[r7,:256]!
+	veor	q3,q3,q3		@ d6-d7
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12},[r7,:128]
+
+	subs	r9,r9,#8
+	vld1.64	{q6,q7},[r6,:256]!
+	vld1.64	{q8,q9},[r6,:256]!
+	vld1.64	{q10,q11},[r6,:256]!
+	vld1.64	{q12,q13},[r6,:256]!
+
+	itt	ne
+	subne	r3,r3,r5,lsl#2	@ rewind
+	bne	LNEON_8n_outer
+
+	add	r7,sp,#128
+	vst1.64	{q2,q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	d10,d12,#16
+	vst1.64	{q2,q3},[sp,:256]!
+	vadd.u64	d13,d13,d10
+	vst1.64	{q2,q3}, [sp,:256]!
+	vshr.u64	d10,d13,#16
+	vst1.64	{q2,q3}, [sp,:256]!
+	vzip.16	d12,d13
+
+	mov	r8,r5
+	b	LNEON_tail_entry
+
+.align	4
+LNEON_tail:
+	vadd.u64	d12,d12,d10
+	vshr.u64	d10,d12,#16
+	vld1.64	{q8,q9}, [r6, :256]!
+	vadd.u64	d13,d13,d10
+	vld1.64	{q10,q11}, [r6, :256]!
+	vshr.u64	d10,d13,#16
+	vld1.64	{q12,q13}, [r6, :256]!
+	vzip.16	d12,d13
+
+LNEON_tail_entry:
+	vadd.u64	d14,d14,d10
+	vst1.32	{d12[0]}, [r7, :32]!
+	vshr.u64	d10,d14,#16
+	vadd.u64	d15,d15,d10
+	vshr.u64	d10,d15,#16
+	vzip.16	d14,d15
+	vadd.u64	d16,d16,d10
+	vst1.32	{d14[0]}, [r7, :32]!
+	vshr.u64	d10,d16,#16
+	vadd.u64	d17,d17,d10
+	vshr.u64	d10,d17,#16
+	vzip.16	d16,d17
+	vadd.u64	d18,d18,d10
+	vst1.32	{d16[0]}, [r7, :32]!
+	vshr.u64	d10,d18,#16
+	vadd.u64	d19,d19,d10
+	vshr.u64	d10,d19,#16
+	vzip.16	d18,d19
+	vadd.u64	d20,d20,d10
+	vst1.32	{d18[0]}, [r7, :32]!
+	vshr.u64	d10,d20,#16
+	vadd.u64	d21,d21,d10
+	vshr.u64	d10,d21,#16
+	vzip.16	d20,d21
+	vadd.u64	d22,d22,d10
+	vst1.32	{d20[0]}, [r7, :32]!
+	vshr.u64	d10,d22,#16
+	vadd.u64	d23,d23,d10
+	vshr.u64	d10,d23,#16
+	vzip.16	d22,d23
+	vadd.u64	d24,d24,d10
+	vst1.32	{d22[0]}, [r7, :32]!
+	vshr.u64	d10,d24,#16
+	vadd.u64	d25,d25,d10
+	vshr.u64	d10,d25,#16
+	vzip.16	d24,d25
+	vadd.u64	d26,d26,d10
+	vst1.32	{d24[0]}, [r7, :32]!
+	vshr.u64	d10,d26,#16
+	vadd.u64	d27,d27,d10
+	vshr.u64	d10,d27,#16
+	vzip.16	d26,d27
+	vld1.64	{q6,q7}, [r6, :256]!
+	subs	r8,r8,#8
+	vst1.32	{d26[0]},   [r7, :32]!
+	bne	LNEON_tail
+
+	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
+	sub	r3,r3,r5,lsl#2			@ rewind r3
+	subs	r1,sp,#0				@ clear carry flag
+	add	r2,sp,r5,lsl#2
+
+LNEON_sub:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r3!, {r8,r9,r10,r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	LNEON_sub
+
+	ldr	r10, [r1]				@ load top-most bit
+	mov	r11,sp
+	veor	q0,q0,q0
+	sub	r11,r2,r11				@ this is num*4
+	veor	q1,q1,q1
+	mov	r1,sp
+	sub	r0,r0,r11				@ rewind r0
+	mov	r3,r2				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+LNEON_copy_n_zap:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r0,  {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	ldmia	r1, {r4,r5,r6,r7}
+	stmia	r0!, {r8,r9,r10,r11}
+	sub	r1,r1,#16
+	ldmia	r0, {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r1,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	LNEON_copy_n_zap
+
+	mov	sp,ip
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	bx	lr						@ bx lr
+
+#endif
+.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
+#endif
diff --git a/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/ios-arm/crypto/fipsmodule/bsaes-armv7.S
new file mode 100644
index 0000000..5e2ebf0
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -0,0 +1,2579 @@
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is a direct adaptation of the bsaes-x86_64 module for
+@ ARM NEON, except that this module is endian-neutral [in the sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@					<ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code	32
+# undef __thumb2__
+#endif
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_decrypt8
+#endif
+.align	4
+_bsaes_decrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#ifdef	__APPLE__
+	adr	r6,LM0ISR
+#else
+	add	r6,r6,#LM0ISR-_bsaes_decrypt8
+#endif
+
+	vldmia	r6!, {q8}		@ LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	Ldec_sbox
+.align	4
+Ldec_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+Ldec_sbox:
+	veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q2
+	veor	q12, q12, q8
+	veor	q2, q2, q6
+	vand	q8, q8, q15
+	vand	q6, q6, q13
+	vand	q12, q12, q14
+	vand	q2, q2, q9
+	veor	q8, q8, q12
+	veor	q2, q2, q6
+	veor	q12, q12, q11
+	veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q4
+	vand	q8, q8, q15
+	vand	q4, q4, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q4
+	veor	q12, q12, q11
+	veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	veor	q7, q7, q0
+	veor	q4, q4, q5
+	veor	q3, q3, q6
+	veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	veor	q0, q0, q14
+	veor	q1, q1, q14
+	veor	q6, q6, q8
+	veor	q2, q2, q10
+	veor	q4, q4, q9
+	veor	q1, q1, q15
+	veor	q6, q6, q15
+	veor	q2, q2, q14
+	veor	q7, q7, q11
+	veor	q4, q4, q14
+	veor	q3, q3, q12
+	veor	q2, q2, q15
+	veor	q7, q7, q15
+	veor	q5, q5, q13
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	Ldec_loop
+	vldmia	r6, {q12}		@ LISRM0
+	b	Ldec_loop
+.align	4
+Ldec_done:
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q3, #1
+	vshr.u64	q11, q2, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q4
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q2, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q4
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q4, #4
+	vshr.u64	q11, q6, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q4, q4, q10
+	veor	q6, q6, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+
+
+
+.align	6
+_bsaes_const:
+LM0ISR:@ InvShiftRows constants
+.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+LISR:
+.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+LISRM0:
+.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+LM0SR:@ ShiftRows constants
+.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+LSR:
+.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+LSRM0:
+.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+LM0:
+.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+LREVM0SR:
+.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	6
+
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_encrypt8
+#endif
+.align	4
+_bsaes_encrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#ifdef	__APPLE__
+	adr	r6,LM0SR
+#else
+	sub	r6,r6,#_bsaes_encrypt8-LM0SR
+#endif
+
+	vldmia	r6!, {q8}		@ LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	Lenc_sbox
+.align	4
+Lenc_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q3
+	vand	q8, q8, q15
+	vand	q3, q3, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q3
+	veor	q12, q12, q11
+	veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q4
+	veor	q12, q12, q8
+	veor	q4, q4, q2
+	vand	q8, q8, q15
+	vand	q2, q2, q13
+	vand	q12, q12, q14
+	vand	q4, q4, q9
+	veor	q8, q8, q12
+	veor	q4, q4, q2
+	veor	q12, q12, q11
+	veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	Lenc_loop
+	vldmia	r6, {q12}		@ LSRM0
+	b	Lenc_loop
+.align	4
+Lenc_done:
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q3, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q4, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q6
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q4, q4, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q3, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q6
+	veor	q11, q11, q4
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #2
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q6, #4
+	vshr.u64	q11, q4, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_key_convert
+#endif
+.align	4
+_bsaes_key_convert:
+	adr	r6,.
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+#ifdef	__APPLE__
+	adr	r6,LM0
+#else
+	sub	r6,r6,#_bsaes_key_convert-LM0
+#endif
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	Lkey_loop
+
+.align	4
+Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
+	bne	Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose L63
+	@ don't save last round key
+	bx	lr
+
+
+
+
+.globl	_bsaes_cbc_encrypt
+.private_extern	_bsaes_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func	_bsaes_cbc_encrypt
+#endif
+.align	5
+_bsaes_cbc_encrypt:
+#ifndef	__KERNEL__
+	cmp	r2, #128
+#ifndef	__thumb__
+	blo	_AES_cbc_encrypt
+#else
+	bhs	1f
+	b	_AES_cbc_encrypt
+1:
+#endif
+#endif
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	Lcbc_dec_loop
+
+.align	4
+Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	Lcbc_dec_loop_finish
+
+	vld1.8	{q0,q1}, [r0]!	@ load input
+	vld1.8	{q2,q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4,q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6,q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	Lcbc_dec_loop
+
+Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	Lcbc_dec_done
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+	beq	Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	mov	r10, r1			@ save original out pointer
+	mov	r1, r9			@ use the iv scratch space as out buffer
+	mov	r2, r3
+	vmov	q4,q15		@ just in case ensure that IV
+	vmov	q5,q0			@ and input are preserved
+	bl	_AES_decrypt
+	vld1.8	{q0}, [r9]		@ load result
+	veor	q0, q0, q4	@ ^= IV
+	vmov	q15, q5		@ q5 holds input
+	vst1.8	{q0}, [r10]		@ write output
+
+Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+Lcbc_dec_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+
+
+.globl	_bsaes_ctr32_encrypt_blocks
+.private_extern	_bsaes_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func	_bsaes_ctr32_encrypt_blocks
+#endif
+.align	5
+_bsaes_ctr32_encrypt_blocks:
+	cmp	r2, #8			@ use plain AES for
+	blo	Lctr_enc_short			@ small sizes
+
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+#ifdef	__APPLE__
+	mov	r8, #:lower16:(LREVM0SR-LM0)
+	add	r8, r6, r8
+#else
+	add	r8, r6, #LREVM0SR-LM0	@ borrow r8
+#endif
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor	q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8	q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	Lctr_enc_loop
+
+.align	4
+Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia	sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x10		@ pass next round key
+#else
+	add	r4, r3, #264
+#endif
+	vldmia	r8, {q8}			@ LREVM0SR
+	mov	r5, r10			@ pass rounds
+	vstmia	r9, {q10}			@ save next counter
+#ifdef	__APPLE__
+	mov	r6, #:lower16:(LREVM0SR-LSR)
+	sub	r6, r8, r6
+#else
+	sub	r6, r8, #LREVM0SR-LSR	@ pass constants
+#endif
+
+	bl	_bsaes_encrypt8_alt
+
+	subs	r2, r2, #8
+	blo	Lctr_enc_loop_done
+
+	vld1.8	{q8,q9}, [r0]!	@ load input
+	vld1.8	{q10,q11}, [r0]!
+	veor	q0, q8
+	veor	q1, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q10
+	veor	q6, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q3, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q7, q13
+	veor	q2, q14
+	vst1.8	{q4}, [r1]!
+	veor	q5, q15
+	vst1.8	{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8	{q3}, [r1]!
+	veor	q9, q9, q9
+	vst1.8	{q7}, [r1]!
+	vext.8	q8, q9, q8, #4
+	vst1.8	{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8	{q5}, [r1]!
+	vldmia	r9, {q0}			@ load counter
+
+	bne	Lctr_enc_loop
+	b	Lctr_enc_done
+
+.align	4
+Lctr_enc_loop_done:
+	add	r2, r2, #8
+	vld1.8	{q8}, [r0]!	@ load input
+	veor	q0, q8
+	vst1.8	{q0}, [r1]!	@ write output
+	cmp	r2, #2
+	blo	Lctr_enc_done
+	vld1.8	{q9}, [r0]!
+	veor	q1, q9
+	vst1.8	{q1}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q10}, [r0]!
+	veor	q4, q10
+	vst1.8	{q4}, [r1]!
+	cmp	r2, #4
+	blo	Lctr_enc_done
+	vld1.8	{q11}, [r0]!
+	veor	q6, q11
+	vst1.8	{q6}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q12}, [r0]!
+	veor	q3, q12
+	vst1.8	{q3}, [r1]!
+	cmp	r2, #6
+	blo	Lctr_enc_done
+	vld1.8	{q13}, [r0]!
+	veor	q7, q13
+	vst1.8	{q7}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q14}, [r0]
+	veor	q2, q14
+	vst1.8	{q2}, [r1]!
+
+Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+Lctr_enc_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	Lctr_enc_bzero
+#else
+	vstmia	sp, {q0,q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+.align	4
+Lctr_enc_short:
+	ldr	ip, [sp]		@ ctr pointer is passed on stack
+	stmdb	sp!, {r4,r5,r6,r7,r8, lr}
+
+	mov	r4, r0		@ copy arguments
+	mov	r5, r1
+	mov	r6, r2
+	mov	r7, r3
+	ldr	r8, [ip, #12]		@ load counter LSW
+	vld1.8	{q1}, [ip]		@ load whole counter value
+#ifdef __ARMEL__
+	rev	r8, r8
+#endif
+	sub	sp, sp, #0x10
+	vst1.8	{q1}, [sp]		@ copy counter value
+	sub	sp, sp, #0x10
+
+Lctr_enc_short_loop:
+	add	r0, sp, #0x10		@ input counter value
+	mov	r1, sp			@ output on the stack
+	mov	r2, r7			@ key
+
+	bl	_AES_encrypt
+
+	vld1.8	{q0}, [r4]!	@ load input
+	vld1.8	{q1}, [sp]		@ load encrypted counter
+	add	r8, r8, #1
+#ifdef __ARMEL__
+	rev	r0, r8
+	str	r0, [sp, #0x1c]		@ next counter value
+#else
+	str	r8, [sp, #0x1c]		@ next counter value
+#endif
+	veor	q0,q0,q1
+	vst1.8	{q0}, [r5]!	@ store output
+	subs	r6, r6, #1
+	bne	Lctr_enc_short_loop
+
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+	vstmia	sp!, {q0,q1}
+
+	ldmia	sp!, {r4,r5,r6,r7,r8, pc}
+
+.globl	_bsaes_xts_encrypt
+.private_extern	_bsaes_xts_encrypt
+#ifdef __thumb2__
+.thumb_func	_bsaes_xts_encrypt
+#endif
+.align	4
+_bsaes_xts_encrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	_AES_encrypt
+	mov	r0,sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
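+	@ Each bit-sliced round key occupies 128 bytes (hence "lsl#7"); the
+	@ commented-out +96 and the -48 net to an extra 0x90 bytes below the
+	@ schedule, which hold the nine tweak values spilled per iteration.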
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, Lxts_magic
+
+	subs	r9, #0x80
+	blo	Lxts_enc_short
+	b	Lxts_enc_loop
+
+.align	4
+Lxts_enc_loop:
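+	@ Each iteration handles 8 blocks: the interleaved vshr.s64/vand/
+	@ vadd.u64/veor sequence doubles the tweak in GF(2^128) seven times
+	@ (see Lxts_magic below), spilling T, T*x, ..., T*x^7 to the stack
+	@ while the eight input blocks are loaded and XORed with them.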
+	vldmia	r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov	r0, sp
+	vand	q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64	{q8}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q9, #63
+	veor	q9, q9, q6
+	vand	q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64	{q9}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q10, #63
+	veor	q10, q10, q7
+	vand	q6, q6, q5
+	vld1.8	{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64	{q10}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q11, #63
+	veor	q11, q11, q6
+	vand	q7, q7, q5
+	vld1.8	{q1}, [r7]!
+	veor	q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64	{q11}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q12, #63
+	veor	q12, q12, q7
+	vand	q6, q6, q5
+	vld1.8	{q2}, [r7]!
+	veor	q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64	{q12}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q13, #63
+	veor	q13, q13, q6
+	vand	q7, q7, q5
+	vld1.8	{q3}, [r7]!
+	veor	q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64	{q13}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q14, #63
+	veor	q14, q14, q7
+	vand	q6, q6, q5
+	vld1.8	{q4}, [r7]!
+	veor	q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64	{q14}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q15, #63
+	veor	q15, q15, q6
+	vand	q7, q7, q5
+	vld1.8	{q5}, [r7]!
+	veor	q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64	{q15}, [r0,:128]!
+	vswp	d15,d14
+	veor	q8, q8, q7
+	vst1.64	{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8	{q6,q7}, [r7]!
+	veor	q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q6, q6, q14
+	mov	r5, r1			@ pass rounds
+	veor	q7, q7, q15
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q6, q11
+	vld1.64	{q14,q15}, [r0,:128]!
+	veor	q10, q3, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	veor	q12, q2, q14
+	vst1.8	{q10,q11}, [r8]!
+	veor	q13, q5, q15
+	vst1.8	{q12,q13}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+
+	subs	r9, #0x80
+	bpl	Lxts_enc_loop
+
+Lxts_enc_short:
+	adds	r9, #0x70
+	bmi	Lxts_enc_done
+
+	vldmia	r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov	r0, sp
+	vand	q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64	{q8}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q9, #63
+	veor	q9, q9, q7
+	vand	q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64	{q9}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q10, #63
+	veor	q10, q10, q6
+	vand	q7, q7, q5
+	vld1.8	{q0}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_1
+	vadd.u64	q11, q10, q10
+	vst1.64	{q10}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q11, #63
+	veor	q11, q11, q7
+	vand	q6, q6, q5
+	vld1.8	{q1}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_2
+	veor	q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64	{q11}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q12, #63
+	veor	q12, q12, q6
+	vand	q7, q7, q5
+	vld1.8	{q2}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_3
+	veor	q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64	{q12}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q13, #63
+	veor	q13, q13, q7
+	vand	q6, q6, q5
+	vld1.8	{q3}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_4
+	veor	q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64	{q13}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q14, #63
+	veor	q14, q14, q6
+	vand	q7, q7, q5
+	vld1.8	{q4}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_5
+	veor	q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64	{q14}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q15, #63
+	veor	q15, q15, q7
+	vand	q6, q6, q5
+	vld1.8	{q5}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_enc_6
+	veor	q4, q4, q12
+	sub	r9, #0x10
+	vst1.64	{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8	{q6}, [r7]!
+	veor	q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q6, q6, q14
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q6, q11
+	vld1.64	{q14}, [r0,:128]!
+	veor	q10, q3, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	veor	q12, q2, q14
+	vst1.8	{q10,q11}, [r8]!
+	vst1.8	{q12}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+.align	4
+Lxts_enc_6:
+	veor	q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q5, q5, q13
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q6, q11
+	veor	q10, q3, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	vst1.8	{q10,q11}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align	5
+Lxts_magic:
+.quad	1, 0x87
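+@ Multiplying the tweak by x in GF(2^128) is a 1-bit left shift plus a
+@ conditional reduction. In the sequences above, vshr.s64 smears each
+@ half's top bit into a mask, vand with {1, 0x87} turns that into a
+@ pending cross-half carry (the 1) and a pending reduction (0x87, from
+@ x^128 = x^7 + x^2 + x + 1), and vswp routes each value to the half
+@ it must be XORed into after the vadd.u64 doubling.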
+
+.align	5
+Lxts_enc_5:
+	veor	q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q4, q4, q12
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q6, q11
+	veor	q10, q3, q12
+	vst1.8	{q8,q9}, [r8]!
+	vst1.8	{q10}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+.align	4
+Lxts_enc_4:
+	veor	q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q3, q3, q11
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q6, q11
+	vst1.8	{q8,q9}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+.align	4
+Lxts_enc_3:
+	veor	q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q2, q2, q10
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	veor	q8, q4, q10
+	vst1.8	{q0,q1}, [r8]!
+	vst1.8	{q8}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+.align	4
+Lxts_enc_2:
+	veor	q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q1, q1, q9
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_encrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	vst1.8	{q0,q1}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_enc_done
+.align	4
+Lxts_enc_1:
+	mov	r0, sp
+	veor	q0, q0, q8
+	mov	r1, sp
+	vst1.8	{q0}, [sp,:128]
+	mov	r2, r10
+	mov	r4, r3				@ preserve fp
+
+	bl	_AES_encrypt
+
+	vld1.8	{q0}, [sp,:128]
+	veor	q0, q0, q8
+	vst1.8	{q0}, [r8]!
+	mov	r3, r4
+
+	vmov	q8, q9		@ next round tweak
+
+Lxts_enc_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds	r9, #0x10
+	beq	Lxts_enc_ret
+	sub	r6, r8, #0x10
+
+Lxts_enc_steal:
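+	@ Ciphertext stealing: the r9 leftover input bytes overwrite the head
+	@ of the last full ciphertext block (at r8-0x10), and the displaced
+	@ ciphertext bytes are emitted as the short final block; the patched
+	@ block is then re-encrypted with the next tweak (q8).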
+	ldrb	r0, [r7], #1
+	ldrb	r1, [r8, #-0x10]
+	strb	r0, [r8, #-0x10]
+	strb	r1, [r8], #1
+
+	subs	r9, #1
+	bhi	Lxts_enc_steal
+
+	vld1.8	{q0}, [r6]
+	mov	r0, sp
+	veor	q0, q0, q8
+	mov	r1, sp
+	vst1.8	{q0}, [sp,:128]
+	mov	r2, r10
+	mov	r4, r3			@ preserve fp
+
+	bl	_AES_encrypt
+
+	vld1.8	{q0}, [sp,:128]
+	veor	q0, q0, q8
+	vst1.8	{q0}, [r6]
+	mov	r3, r4
+#endif
+
+Lxts_enc_ret:
+	bic	r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+Lxts_enc_bzero:	@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r0
+	bne	Lxts_enc_bzero
+
+	mov	sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8	{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+
+
+.globl	_bsaes_xts_decrypt
+.private_extern	_bsaes_xts_decrypt
+#ifdef __thumb2__
+.thumb_func	_bsaes_xts_decrypt
+#endif
+.align	4
+_bsaes_xts_decrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	_AES_encrypt
+	mov	r0, sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, sp, #0x90
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r10, #248
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, Lxts_magic
+
+#ifndef	XTS_CHAIN_TWEAK
+	tst	r9, #0xf			@ if not multiple of 16
+	it	ne				@ Thumb2 thing, sanity check in ARM
+	subne	r9, #0x10			@ subtract another 16 bytes
+#endif
+	subs	r9, #0x80
+
+	blo	Lxts_dec_short
+	b	Lxts_dec_loop
+
+.align	4
+Lxts_dec_loop:
+	vldmia	r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov	r0, sp
+	vand	q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64	{q8}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q9, #63
+	veor	q9, q9, q6
+	vand	q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64	{q9}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q10, #63
+	veor	q10, q10, q7
+	vand	q6, q6, q5
+	vld1.8	{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64	{q10}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q11, #63
+	veor	q11, q11, q6
+	vand	q7, q7, q5
+	vld1.8	{q1}, [r7]!
+	veor	q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64	{q11}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q12, #63
+	veor	q12, q12, q7
+	vand	q6, q6, q5
+	vld1.8	{q2}, [r7]!
+	veor	q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64	{q12}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q13, #63
+	veor	q13, q13, q6
+	vand	q7, q7, q5
+	vld1.8	{q3}, [r7]!
+	veor	q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64	{q13}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q14, #63
+	veor	q14, q14, q7
+	vand	q6, q6, q5
+	vld1.8	{q4}, [r7]!
+	veor	q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64	{q14}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q15, #63
+	veor	q15, q15, q6
+	vand	q7, q7, q5
+	vld1.8	{q5}, [r7]!
+	veor	q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64	{q15}, [r0,:128]!
+	vswp	d15,d14
+	veor	q8, q8, q7
+	vst1.64	{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8	{q6,q7}, [r7]!
+	veor	q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q6, q6, q14
+	mov	r5, r1			@ pass rounds
+	veor	q7, q7, q15
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q4, q11
+	vld1.64	{q14,q15}, [r0,:128]!
+	veor	q10, q2, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	veor	q12, q3, q14
+	vst1.8	{q10,q11}, [r8]!
+	veor	q13, q5, q15
+	vst1.8	{q12,q13}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+
+	subs	r9, #0x80
+	bpl	Lxts_dec_loop
+
+Lxts_dec_short:
+	adds	r9, #0x70
+	bmi	Lxts_dec_done
+
+	vldmia	r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov	r0, sp
+	vand	q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64	{q8}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q9, #63
+	veor	q9, q9, q7
+	vand	q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64	{q9}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q10, #63
+	veor	q10, q10, q6
+	vand	q7, q7, q5
+	vld1.8	{q0}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_1
+	vadd.u64	q11, q10, q10
+	vst1.64	{q10}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q11, #63
+	veor	q11, q11, q7
+	vand	q6, q6, q5
+	vld1.8	{q1}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_2
+	veor	q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64	{q11}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q12, #63
+	veor	q12, q12, q6
+	vand	q7, q7, q5
+	vld1.8	{q2}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_3
+	veor	q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64	{q12}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q13, #63
+	veor	q13, q13, q7
+	vand	q6, q6, q5
+	vld1.8	{q3}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_4
+	veor	q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64	{q13}, [r0,:128]!
+	vswp	d13,d12
+	vshr.s64	q7, q14, #63
+	veor	q14, q14, q6
+	vand	q7, q7, q5
+	vld1.8	{q4}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_5
+	veor	q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64	{q14}, [r0,:128]!
+	vswp	d15,d14
+	vshr.s64	q6, q15, #63
+	veor	q15, q15, q7
+	vand	q6, q6, q5
+	vld1.8	{q5}, [r7]!
+	subs	r9, #0x10
+	bmi	Lxts_dec_6
+	veor	q4, q4, q12
+	sub	r9, #0x10
+	vst1.64	{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8	{q6}, [r7]!
+	veor	q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q6, q6, q14
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q4, q11
+	vld1.64	{q14}, [r0,:128]!
+	veor	q10, q2, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	veor	q12, q3, q14
+	vst1.8	{q10,q11}, [r8]!
+	vst1.8	{q12}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_6:
+	vst1.64	{q14}, [r0,:128]		@ next round tweak
+
+	veor	q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q5, q5, q13
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12,q13}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q4, q11
+	veor	q10, q2, q12
+	vst1.8	{q8,q9}, [r8]!
+	veor	q11, q7, q13
+	vst1.8	{q10,q11}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_5:
+	veor	q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q4, q4, q12
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	vld1.64	{q12}, [r0,:128]!
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q4, q11
+	veor	q10, q2, q12
+	vst1.8	{q8,q9}, [r8]!
+	vst1.8	{q10}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_4:
+	veor	q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q3, q3, q11
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10,q11}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	veor	q9, q4, q11
+	vst1.8	{q8,q9}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_3:
+	veor	q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q2, q2, q10
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	vld1.64	{q10}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	veor	q8, q6, q10
+	vst1.8	{q0,q1}, [r8]!
+	vst1.8	{q8}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_2:
+	veor	q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x90			@ pass key schedule
+#else
+	add	r4, r10, #248			@ pass key schedule
+#endif
+	veor	q1, q1, q9
+	mov	r5, r1			@ pass rounds
+	mov	r0, sp
+
+	bl	_bsaes_decrypt8
+
+	vld1.64	{q8,q9}, [r0,:128]!
+	veor	q0, q0, q8
+	veor	q1, q1, q9
+	vst1.8	{q0,q1}, [r8]!
+
+	vld1.64	{q8}, [r0,:128]		@ next round tweak
+	b	Lxts_dec_done
+.align	4
+Lxts_dec_1:
+	mov	r0, sp
+	veor	q0, q0, q8
+	mov	r1, sp
+	vst1.8	{q0}, [sp,:128]
+	mov	r5, r2			@ preserve magic
+	mov	r2, r10
+	mov	r4, r3				@ preserve fp
+
+	bl	_AES_decrypt
+
+	vld1.8	{q0}, [sp,:128]
+	veor	q0, q0, q8
+	vst1.8	{q0}, [r8]!
+	mov	r3, r4
+	mov	r2, r5
+
+	vmov	q8, q9		@ next round tweak
+
+Lxts_dec_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds	r9, #0x10
+	beq	Lxts_dec_ret
+
+	@ calculate one round of extra tweak for the stolen ciphertext
+	vldmia	r2, {q5}
+	vshr.s64	q6, q8, #63
+	vand	q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vswp	d13,d12
+	veor	q9, q9, q6
+
+	@ perform the final decryption with the last tweak value
+	vld1.8	{q0}, [r7]!
+	mov	r0, sp
+	veor	q0, q0, q9
+	mov	r1, sp
+	vst1.8	{q0}, [sp,:128]
+	mov	r2, r10
+	mov	r4, r3			@ preserve fp
+
+	bl	_AES_decrypt
+
+	vld1.8	{q0}, [sp,:128]
+	veor	q0, q0, q9
+	vst1.8	{q0}, [r8]
+
+	mov	r6, r8
+Lxts_dec_steal:
+	ldrb	r1, [r8]
+	ldrb	r0, [r7], #1
+	strb	r1, [r8, #0x10]
+	strb	r0, [r8], #1
+
+	subs	r9, #1
+	bhi	Lxts_dec_steal
+
+	vld1.8	{q0}, [r6]
+	mov	r0, sp
+	veor	q0, q8
+	mov	r1, sp
+	vst1.8	{q0}, [sp,:128]
+	mov	r2, r10
+
+	bl	_AES_decrypt
+
+	vld1.8	{q0}, [sp,:128]
+	veor	q0, q0, q8
+	vst1.8	{q0}, [r6]
+	mov	r3, r4
+#endif
+
+Lxts_dec_ret:
+	bic	r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+Lxts_dec_bzero:	@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r0
+	bne	Lxts_dec_bzero
+
+	mov	sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8	{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+
+#endif
diff --git a/ios-arm/crypto/fipsmodule/ghash-armv4.S b/ios-arm/crypto/fipsmodule/ghash-armv4.S
new file mode 100644
index 0000000..ace157f
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/ghash-armv4.S
@@ -0,0 +1,588 @@
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in ghashv8-armx.pl.)
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#ifdef  __clang__
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
+#endif
+
+
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
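+
+@ Reduction table for the 4-bit ("Shoup") GHASH multiply: each step
+@ shifts the 128-bit accumulator right by four bits, and the bits that
+@ drop off represent a multiple of x^128. rem_4bit[rem] is that multiple
+@ already reduced mod the GHASH polynomial, packed so it can be XORed
+@ into the top 16 bits (the "ldrh ... @ rem_4bit[rem]" loads below).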
+
+
+#ifdef __thumb2__
+.thumb_func	rem_4bit_get
+#endif
+rem_4bit_get:
+#if defined(__thumb2__)
+	adr	r2,rem_4bit
+#else
+	sub	r2,pc,#8+32	@ &rem_4bit
+#endif
+	b	Lrem_4bit_got
+	nop
+	nop
+
+
+.globl	_gcm_ghash_4bit
+.private_extern	_gcm_ghash_4bit
+#ifdef __thumb2__
+.thumb_func	_gcm_ghash_4bit
+#endif
+.align	4
+_gcm_ghash_4bit:
+#if defined(__thumb2__)
+	adr	r12,rem_4bit
+#else
+	sub	r12,pc,#8+48		@ &rem_4bit
+#endif
+	add	r3,r2,r3		@ r3 to point at the end
+	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too
+
+	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack
+
+	ldrb	r12,[r2,#15]
+	ldrb	r14,[r0,#15]
+Louter:
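+	@ One 16-byte input block per iteration: Xi ^= inp (byte by byte,
+	@ starting from offset 15), then Xi*H is accumulated four bits at a
+	@ time. nlo/nhi select among the 16 precomputed multiples of H in
+	@ Htbl (16 bytes apiece, hence "lsl#4"), and r3 counts down the 14
+	@ bytes left for Linner.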
+	eor	r12,r12,r14
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	mov	r3,#14
+
+	add	r7,r1,r12,lsl#4
+	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
+	add	r11,r1,r14
+	ldrb	r12,[r2,#14]
+
+	and	r14,r4,#0xf		@ rem
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
+	add	r14,r14,r14
+	eor	r4,r8,r4,lsr#4
+	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
+	eor	r4,r4,r5,lsl#28
+	ldrb	r14,[r0,#14]
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+	eor	r12,r12,r14
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	eor	r7,r7,r8,lsl#16
+
+Linner:
+	add	r11,r1,r12,lsl#4
+	and	r12,r4,#0xf		@ rem
+	subs	r3,r3,#1
+	add	r12,r12,r12
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
+	eor	r6,r10,r6,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
+	ldrplb	r12,[r2,r3]
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	add	r14,r14,r14
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
+	eor	r4,r8,r4,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
+	ldrplb	r8,[r0,r3]
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	ldrh	r9,[sp,r14]
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+#ifdef	__thumb2__
+	it	pl
+#endif
+	eorpl	r12,r12,r8
+	eor	r7,r11,r7,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
+	andpl	r14,r12,#0xf0
+	andpl	r12,r12,#0x0f
+	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
+	bpl	Linner
+
+	ldr	r3,[sp,#32]		@ re-load r3/end
+	add	r2,r2,#16
+	mov	r14,r4
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r4,r4
+	str	r4,[r0,#12]
+#elif defined(__ARMEB__)
+	str	r4,[r0,#12]
+#else
+	mov	r9,r4,lsr#8
+	strb	r4,[r0,#12+3]
+	mov	r10,r4,lsr#16
+	strb	r9,[r0,#12+2]
+	mov	r11,r4,lsr#24
+	strb	r10,[r0,#12+1]
+	strb	r11,[r0,#12]
+#endif
+	cmp	r2,r3
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r5,r5
+	str	r5,[r0,#8]
+#elif defined(__ARMEB__)
+	str	r5,[r0,#8]
+#else
+	mov	r9,r5,lsr#8
+	strb	r5,[r0,#8+3]
+	mov	r10,r5,lsr#16
+	strb	r9,[r0,#8+2]
+	mov	r11,r5,lsr#24
+	strb	r10,[r0,#8+1]
+	strb	r11,[r0,#8]
+#endif
+
+#ifdef __thumb2__
+	it	ne
+#endif
+	ldrneb	r12,[r2,#15]
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r6,r6
+	str	r6,[r0,#4]
+#elif defined(__ARMEB__)
+	str	r6,[r0,#4]
+#else
+	mov	r9,r6,lsr#8
+	strb	r6,[r0,#4+3]
+	mov	r10,r6,lsr#16
+	strb	r9,[r0,#4+2]
+	mov	r11,r6,lsr#24
+	strb	r10,[r0,#4+1]
+	strb	r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r7,r7
+	str	r7,[r0,#0]
+#elif defined(__ARMEB__)
+	str	r7,[r0,#0]
+#else
+	mov	r9,r7,lsr#8
+	strb	r7,[r0,#0+3]
+	mov	r10,r7,lsr#16
+	strb	r9,[r0,#0+2]
+	mov	r11,r7,lsr#24
+	strb	r10,[r0,#0+1]
+	strb	r11,[r0,#0]
+#endif
+
+	bne	Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+.globl	_gcm_gmult_4bit
+.private_extern	_gcm_gmult_4bit
+#ifdef __thumb2__
+.thumb_func	_gcm_gmult_4bit
+#endif
+_gcm_gmult_4bit:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	ldrb	r12,[r0,#15]
+	b	rem_4bit_get
+Lrem_4bit_got:
+	and	r14,r12,#0xf0
+	and	r12,r12,#0x0f
+	mov	r3,#14
+
+	add	r7,r1,r12,lsl#4
+	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
+	ldrb	r12,[r0,#14]
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
+	add	r14,r14,r14
+	eor	r4,r8,r4,lsr#4
+	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+	and	r14,r12,#0xf0
+	eor	r7,r7,r8,lsl#16
+	and	r12,r12,#0x0f
+
+Loop:
+	add	r11,r1,r12,lsl#4
+	and	r12,r4,#0xf		@ rem
+	subs	r3,r3,#1
+	add	r12,r12,r12
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	eor	r5,r5,r6,lsl#28
+	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
+	eor	r6,r10,r6,lsr#4
+#ifdef	__thumb2__
+	it	pl
+#endif
+	ldrplb	r12,[r0,r3]
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+
+	add	r11,r1,r14
+	and	r14,r4,#0xf		@ rem
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	add	r14,r14,r14
+	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
+	eor	r4,r8,r4,lsr#4
+	eor	r4,r4,r5,lsl#28
+	eor	r5,r9,r5,lsr#4
+	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
+	eor	r5,r5,r6,lsl#28
+	eor	r6,r10,r6,lsr#4
+	eor	r6,r6,r7,lsl#28
+	eor	r7,r11,r7,lsr#4
+#ifdef	__thumb2__
+	itt	pl
+#endif
+	andpl	r14,r12,#0xf0
+	andpl	r12,r12,#0x0f
+	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
+	bpl	Loop
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r4,r4
+	str	r4,[r0,#12]
+#elif defined(__ARMEB__)
+	str	r4,[r0,#12]
+#else
+	mov	r9,r4,lsr#8
+	strb	r4,[r0,#12+3]
+	mov	r10,r4,lsr#16
+	strb	r9,[r0,#12+2]
+	mov	r11,r4,lsr#24
+	strb	r10,[r0,#12+1]
+	strb	r11,[r0,#12]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r5,r5
+	str	r5,[r0,#8]
+#elif defined(__ARMEB__)
+	str	r5,[r0,#8]
+#else
+	mov	r9,r5,lsr#8
+	strb	r5,[r0,#8+3]
+	mov	r10,r5,lsr#16
+	strb	r9,[r0,#8+2]
+	mov	r11,r5,lsr#24
+	strb	r10,[r0,#8+1]
+	strb	r11,[r0,#8]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r6,r6
+	str	r6,[r0,#4]
+#elif defined(__ARMEB__)
+	str	r6,[r0,#4]
+#else
+	mov	r9,r6,lsr#8
+	strb	r6,[r0,#4+3]
+	mov	r10,r6,lsr#16
+	strb	r9,[r0,#4+2]
+	mov	r11,r6,lsr#24
+	strb	r10,[r0,#4+1]
+	strb	r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	r7,r7
+	str	r7,[r0,#0]
+#elif defined(__ARMEB__)
+	str	r7,[r0,#0]
+#else
+	mov	r9,r7,lsr#8
+	strb	r7,[r0,#0+3]
+	mov	r10,r7,lsr#16
+	strb	r9,[r0,#0+2]
+	mov	r11,r7,lsr#24
+	strb	r10,[r0,#0+1]
+	strb	r11,[r0,#0]
+#endif
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl	_gcm_init_neon
+.private_extern	_gcm_init_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_init_neon
+#endif
+.align	4
+_gcm_init_neon:
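+	@ Build the "twisted" hash key used by the NEON routines below: H is
+	@ doubled (H<<<=1, carry moved across the two 64-bit halves) and,
+	@ when the top bit of H was set, XORed with the t0=0xc2....01
+	@ constant assembled in q8.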
+	vld1.64	d7,[r1]!		@ load H
+	vmov.i8	q8,#0xe1
+	vld1.64	d6,[r1]
+	vshl.i64	d17,#57
+	vshr.u64	d16,#63		@ t0=0xc2....01
+	vdup.8	q9,d7[7]
+	vshr.u64	d26,d6,#63
+	vshr.s8	q9,#7			@ broadcast carry bit
+	vshl.i64	q3,q3,#1
+	vand	q8,q8,q9
+	vorr	d7,d26		@ H<<<=1
+	veor	q3,q3,q8		@ twisted H
+	vstmia	r0,{q3}
+
+	bx	lr					@ bx lr
+
+
+.globl	_gcm_gmult_neon
+.private_extern	_gcm_gmult_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_gmult_neon
+#endif
+.align	4
+_gcm_gmult_neon:
+	vld1.64	d7,[r0]!		@ load Xi
+	vld1.64	d6,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+	mov	r3,#16
+	b	Lgmult_neon
+
+
+.globl	_gcm_ghash_neon
+.private_extern	_gcm_ghash_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_ghash_neon
+#endif
+.align	4
+_gcm_ghash_neon:
+	vld1.64	d1,[r0]!		@ load Xi
+	vld1.64	d0,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+
+Loop_neon:
+	vld1.64	d7,[r2]!		@ load inp
+	vld1.64	d6,[r2]!
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	veor	q3,q0			@ inp^=Xi
+Lgmult_neon:
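+	@ Carry-less 64x64 multiply synthesized from 8-bit vmull.p8: each of
+	@ the three blocks below combines eight byte-rotated partial
+	@ products (A1..A3 rotate one operand, B1..B4 the other), masks off
+	@ the lanes that would overlap (d29/d30/d31) and realigns them with
+	@ vext. The three blocks yield the Karatsuba half-products q0, q1
+	@ and q2, merged by the post-processing veor sequence that follows.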
+	vext.8	d16, d26, d26, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d0, d6, d6, #1	@ B1
+	vmull.p8	q0, d26, d0		@ E = A*B1
+	vext.8	d18, d26, d26, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d26, d22		@ G = A*B2
+	vext.8	d20, d26, d26, #3	@ A3
+	veor	q8, q8, q0		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d0, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q0, d26, d0		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d26, d22		@ K = A*B4
+	veor	q10, q10, q0		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q0, d26, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q0, q0, q8
+	veor	q0, q0, q10
+	veor	d6,d6,d7	@ Karatsuba pre-processing
+	vext.8	d16, d28, d28, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d2, d6, d6, #1	@ B1
+	vmull.p8	q1, d28, d2		@ E = A*B1
+	vext.8	d18, d28, d28, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d28, d22		@ G = A*B2
+	vext.8	d20, d28, d28, #3	@ A3
+	veor	q8, q8, q1		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d2, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q1, d28, d2		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d28, d22		@ K = A*B4
+	veor	q10, q10, q1		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q1, d28, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q1, q1, q8
+	veor	q1, q1, q10
+	vext.8	d16, d27, d27, #1	@ A1
+	vmull.p8	q8, d16, d7		@ F = A1*B
+	vext.8	d4, d7, d7, #1	@ B1
+	vmull.p8	q2, d27, d4		@ E = A*B1
+	vext.8	d18, d27, d27, #2	@ A2
+	vmull.p8	q9, d18, d7		@ H = A2*B
+	vext.8	d22, d7, d7, #2	@ B2
+	vmull.p8	q11, d27, d22		@ G = A*B2
+	vext.8	d20, d27, d27, #3	@ A3
+	veor	q8, q8, q2		@ L = E + F
+	vmull.p8	q10, d20, d7		@ J = A3*B
+	vext.8	d4, d7, d7, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q2, d27, d4		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d7, d7, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d27, d22		@ K = A*B4
+	veor	q10, q10, q2		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q2, d27, d7		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q2, q2, q8
+	veor	q2, q2, q10
+	veor	q1,q1,q0		@ Karatsuba post-processing
+	veor	q1,q1,q2
+	veor	d1,d1,d2
+	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
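+	@ Two-phase reduction of the 256-bit product (q0:q2, middle term
+	@ already folded in) mod the bit-reflected GHASH polynomial
+	@ x^128 + x^7 + x^2 + x + 1: left shifts by 57/62/63 fold the low
+	@ 128 bits up, then right shifts totalling 1, 2 and 7 fold the
+	@ high bits back down.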
+	vshl.i64	q9,q0,#57		@ 1st phase
+	vshl.i64	q10,q0,#62
+	veor	q10,q10,q9		@
+	vshl.i64	q9,q0,#63
+	veor	q10, q10, q9		@
+	veor	d1,d1,d20	@
+	veor	d4,d4,d21
+
+	vshr.u64	q10,q0,#1		@ 2nd phase
+	veor	q2,q2,q0
+	veor	q0,q0,q10		@
+	vshr.u64	q10,q10,#6
+	vshr.u64	q0,q0,#1		@
+	veor	q0,q0,q2		@
+	veor	q0,q0,q10		@
+
+	subs	r3,#16
+	bne	Loop_neon
+
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	sub	r0,#16
+	vst1.64	d1,[r0]!		@ write out Xi
+	vst1.64	d0,[r0]
+
+	bx	lr					@ bx lr
+
+#endif
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
diff --git a/ios-arm/crypto/fipsmodule/ghashv8-armx32.S b/ios-arm/crypto/fipsmodule/ghashv8-armx32.S
new file mode 100644
index 0000000..af26892
--- /dev/null
+++ b/ios-arm/crypto/fipsmodule/ghashv8-armx32.S
@@ -0,0 +1,241 @@
+#include <openssl/arm_arch.h>
+
+.text
+
+.code	32
+#undef	__thumb2__
+.globl	_gcm_init_v8
+.private_extern	_gcm_init_v8
+#ifdef __thumb2__
+.thumb_func	_gcm_init_v8
+#endif
+.align	4
+_gcm_init_v8:
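+	@ Same twisted-key preparation as _gcm_init_neon above, here feeding
+	@ the ARMv8 PMULL (vmull.p64) code path.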
+	vld1.64	{q9},[r1]		@ load input H
+	vmov.i8	q11,#0xe1
+	vshl.i64	q11,q11,#57		@ 0xc2.0
+	vext.8	q3,q9,q9,#8
+	vshr.u64	q10,q11,#63
+	vdup.32	q9,d18[1]
+	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
+	vshr.u64	q10,q3,#63
+	vshr.s32	q9,q9,#31		@ broadcast carry bit
+	vand	q10,q10,q8
+	vshl.i64	q3,q