aboutsummaryrefslogtreecommitdiff
path: root/arch/arm/crypto/blake2b-neon-core.S
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /arch/arm/crypto/blake2b-neon-core.S
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'arch/arm/crypto/blake2b-neon-core.S')
-rw-r--r--arch/arm/crypto/blake2b-neon-core.S347
1 files changed, 347 insertions, 0 deletions
diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S
new file mode 100644
index 000000000..0406a1863
--- /dev/null
+++ b/arch/arm/crypto/blake2b-neon-core.S
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ // The arguments to blake2b_compress_neon()
+ STATE .req r0
+ BLOCK .req r1
+ NBLOCKS .req r2
+ INC .req r3
+
+ // Pointers to the rotation tables
+ ROR24_TABLE .req r4
+ ROR16_TABLE .req r5
+
+ // The original stack pointer
+ ORIG_SP .req r6
+
+ // NEON registers which contain the message words of the current block.
+ // M_0-M_3 are occasionally used for other purposes too.
+ M_0 .req d16
+ M_1 .req d17
+ M_2 .req d18
+ M_3 .req d19
+ M_4 .req d20
+ M_5 .req d21
+ M_6 .req d22
+ M_7 .req d23
+ M_8 .req d24
+ M_9 .req d25
+ M_10 .req d26
+ M_11 .req d27
+ M_12 .req d28
+ M_13 .req d29
+ M_14 .req d30
+ M_15 .req d31
+
+ .align 4
+ // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+ // instruction. This is the most efficient way to implement these
+ // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
+ // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1
+ // The BLAKE2b initialization vector
+.Lblake2b_IV:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers. The macro arguments s0-s15 give the order in which the message
+// words are used in this round. 'final' is 1 if this is the final round.
+.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+ // Mix the columns:
+ // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+ // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s0
+ vadd.u64 d1, d1, M_\s2
+ vadd.u64 d2, d2, M_\s4
+ vadd.u64 d3, d3, M_\s6
+
+ // d = ror64(d ^ a, 32);
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vrev64.32 q6, q6
+ vrev64.32 q7, q7
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor q2, q2, q4
+ veor q3, q3, q5
+ vtbl.8 d4, {d4}, M_0
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+ //
+ // M_0 got clobbered above, so we have to reload it if any of the four
+ // message words this step needs happens to be M_0. Otherwise we don't
+ // need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s1
+ vadd.u64 d1, d1, M_\s3
+ vadd.u64 d2, d2, M_\s5
+ vadd.u64 d3, d3, M_\s7
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 63);
+ //
+ // This rotation amount isn't a multiple of 8, so it has to be
+ // implemented using a pair of shifts, which requires temporary
+ // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+ veor q8, q2, q4
+ veor q9, q3, q5
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ vld1.8 {q8-q9}, [sp, :256]
+
+ // Mix the diagonals:
+ // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+ // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+ //
+ // There are two possible ways to do this: use 'vext' instructions to
+ // shift the rows of the matrix so that the diagonals become columns,
+ // and undo it afterwards; or just use 64-bit operations on 'd'
+ // registers instead of 128-bit operations on 'q' registers. We use the
+ // latter approach, as it performs much better on Cortex-A7.
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s8
+ vadd.u64 d1, d1, M_\s10
+ vadd.u64 d2, d2, M_\s12
+ vadd.u64 d3, d3, M_\s14
+
+ // d = ror64(d ^ a, 32);
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vrev64.32 d15, d15
+ vrev64.32 d12, d12
+ vrev64.32 d13, d13
+ vrev64.32 d14, d14
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor d5, d5, d10
+ veor d6, d6, d11
+ veor d7, d7, d8
+ veor d4, d4, d9
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+ vtbl.8 d4, {d4}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s9
+ vadd.u64 d1, d1, M_\s11
+ vadd.u64 d2, d2, M_\s13
+ vadd.u64 d3, d3, M_\s15
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 63);
+ veor d16, d4, d9
+ veor d17, d5, d10
+ veor d18, d6, d11
+ veor d19, d7, d8
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ // Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+ vld1.8 {q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_state *state,
+// const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_state are used:
+// u64 h[8]; (inout)
+// u64 t[2]; (inout)
+// u64 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2b_compress_neon)
+ push {r4-r10}
+
+ // Allocate a 32-byte stack buffer that is 32-byte aligned.
+ mov ORIG_SP, sp
+ sub ip, sp, #32
+ bic ip, ip, #31
+ mov sp, ip
+
+ adr ROR24_TABLE, .Lror24_table
+ adr ROR16_TABLE, .Lror16_table
+
+ mov ip, STATE
+ vld1.64 {q0-q1}, [ip]! // Load h[0..3]
+ vld1.64 {q2-q3}, [ip]! // Load h[4..7]
+.Lnext_block:
+ adr r10, .Lblake2b_IV
+ vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
+ vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
+ vmov r7, r8, d28 // Copy t[0] to (r7, r8)
+ vld1.64 {q6-q7}, [r10] // Load IV[4..7]
+ adds r7, r7, INC // Increment counter
+ bcs .Lslow_inc_ctr
+ vmov.i32 d28[0], r7
+ vst1.64 {d28}, [ip] // Update t[0]
+.Linc_ctr_done:
+
+ // Load the next message block and finish initializing the state matrix
+ // 'v'. Fortunately, there are exactly enough NEON registers to fit the
+ // entire state matrix in q0-q7 and the entire message block in q8-15.
+ //
+ // However, _blake2b_round also needs some extra registers for rotates,
+ // so we have to spill some registers. It's better to spill the message
+ // registers than the state registers, as the message doesn't change.
+ // Therefore we store a copy of the first 32 bytes of the message block
+ // (q8-q9) in an aligned buffer on the stack so that they can be
+ // reloaded when needed. (We could just reload directly from the
+ // message buffer, but it's faster to use aligned loads.)
+ vld1.8 {q8-q9}, [BLOCK]!
+ veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
+ vld1.8 {q10-q11}, [BLOCK]!
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
+ vld1.8 {q12-q13}, [BLOCK]!
+ vst1.8 {q8-q9}, [sp, :256]
+ mov ip, STATE
+ vld1.8 {q14-q15}, [BLOCK]!
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+ final=1
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
+ veor q0, q0, q4 // v[0..1] ^= v[8..9]
+ veor q1, q1, q5 // v[2..3] ^= v[10..11]
+ vld1.64 {q10-q11}, [ip] // Load old h[4..7]
+ veor q2, q2, q6 // v[4..5] ^= v[12..13]
+ veor q3, q3, q7 // v[6..7] ^= v[14..15]
+ veor q0, q0, q8 // v[0..1] ^= h[0..1]
+ veor q1, q1, q9 // v[2..3] ^= h[2..3]
+ mov ip, STATE
+ subs NBLOCKS, NBLOCKS, #1 // nblocks--
+ vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
+ veor q2, q2, q10 // v[4..5] ^= h[4..5]
+ veor q3, q3, q11 // v[6..7] ^= h[6..7]
+ vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
+
+ // Advance to the next block, if there is one.
+ bne .Lnext_block // nblocks != 0?
+
+ mov sp, ORIG_SP
+ pop {r4-r10}
+ mov pc, lr
+
+.Lslow_inc_ctr:
+ // Handle the case where the counter overflowed its low 32 bits, by
+ // carrying the overflow bit into the full 128-bit counter.
+ vmov r9, r10, d29
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adc r10, r10, #0
+ vmov d28, r7, r8
+ vmov d29, r9, r10
+ vst1.64 {q14}, [ip] // Update t[0] and t[1]
+ b .Linc_ctr_done
+ENDPROC(blake2b_compress_neon)