From 5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2023 18:24:12 -0800 Subject: Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. 
- Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. 
Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. 
- Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) 
- ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existence - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload
depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ... --- arch/arm/kernel/entry-common.S | 460 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 arch/arm/kernel/entry-common.S (limited to 'arch/arm/kernel/entry-common.S') diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S new file mode 100644 index 000000000..03d4c5578 --- /dev/null +++ b/arch/arm/kernel/entry-common.S @@ -0,0 +1,460 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/arch/arm/kernel/entry-common.S + * + * Copyright (C) 2000 Russell King + */ + +#include +#include +#include +#include +#include +#ifdef CONFIG_AEABI +#include +#endif + + .equ NR_syscalls, __NR_syscalls + +#include "entry-header.S" + +saved_psr .req r8 +#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER) +saved_pc .req r9 +#define TRACE(x...) x +#else +saved_pc .req lr +#define TRACE(x...) +#endif + + .section .entry.text,"ax",%progbits + .align 5 +#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER) || \ + IS_ENABLED(CONFIG_DEBUG_RSEQ)) +/* + * This is the fast syscall return path. We do as little as possible here, + * such as avoiding writing r0 to the stack. We only use this path if we + * have tracing, context tracking and rseq debug disabled - the overheads + * from those features make this path too inefficient. 
+ */
+ret_fast_syscall:
+__ret_fast_syscall:
+ UNWIND(.fnstart	)
+ UNWIND(.cantunwind	)
+	disable_irq_notrace			@ disable interrupts
+	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
+	movs	r1, r1, lsl #16			@ Z clear if any of flag bits 0..15 is set
+	bne	fast_work_pending
+
+	restore_user_regs fast = 1, offset = S_OFF
+ UNWIND(.fnend		)
+ENDPROC(ret_fast_syscall)
+
+	/* Ok, we need to do extra processing, enter the slow path. */
+fast_work_pending:
+	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0; writeback moves sp up by S_R0+S_OFF
+	/* fall through to work_pending */
+#else
+/*
+ * The "replacement" ret_fast_syscall for when tracing, context tracking,
+ * or rseq debug is enabled. As we will need to call out to some C functions,
+ * we save r0 first to avoid needing to save registers around each C function
+ * call.
+ */
+ret_fast_syscall:
+__ret_fast_syscall:
+ UNWIND(.fnstart	)
+ UNWIND(.cantunwind	)
+	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
+#if IS_ENABLED(CONFIG_DEBUG_RSEQ)
+	/* do_rseq_syscall needs interrupts enabled. */
+	mov	r0, sp				@ 'regs'
+	bl	do_rseq_syscall
+#endif
+	disable_irq_notrace			@ disable interrupts
+	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
+	movs	r1, r1, lsl #16			@ Z set if flag bits 0..15 are all clear
+	beq	no_work_pending
+ UNWIND(.fnend		)
+ENDPROC(ret_fast_syscall)
+
+	/* Slower path - fall through to work_pending */
+#endif
+
+	/*
+	 * Both variants arrive here with the returned r0 already stored and
+	 * with r1 holding the thread flags shifted left by 16.
+	 *
+	 * NOTE(review): because of that shift, the test below is applied to
+	 * the shifted value rather than to the flags word as loaded - confirm
+	 * that this is the intended check for _TIF_SYSCALL_WORK.
+	 */
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	__sys_trace_return_nosave
+	/*
+	 * Call do_work_pending() with r0 = sp ('regs') and r2 = why; r1 is
+	 * passed through as left by the code above. The result is used as
+	 * follows: zero - nothing to restart, return to user space;
+	 * non-zero - re-issue the system call through local_restart, and a
+	 * negative value first replaces scno with restart_syscall.
+	 */
+slow_work_pending:
+	mov	r0, sp				@ 'regs'
+	mov	r2, why				@ 'syscall'
+	bl	do_work_pending
+	cmp	r0, #0
+	beq	no_work_pending
+	movlt	scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
+	/*
+	 * local_restart lies after the point where vector_swi records the
+	 * syscall number in TI_ABI_SYSCALL, so record the (possibly changed)
+	 * number here; otherwise the stored value would still describe the
+	 * interrupted call instead of the one about to be issued.
+	 */
+	str	scno, [tsk, #TI_ABI_SYSCALL]	@ make sure tracers see update
+	ldmia	sp, {r0 - r6}			@ have to reload r0 - r6
+	b	local_restart			@ ... and off we go
+ENDPROC(ret_fast_syscall)
+
+/*
+ * "slow" syscall return path. "why" tells us if this was a real syscall.
+ * IRQs may be enabled here, so always disable them. Note that we use the
+ * "notrace" version to avoid calling into the tracing code unnecessarily.
+ * do_work_pending() will update this state if necessary.
+ */
+ENTRY(ret_to_user)
+ret_slow_syscall:
+#if IS_ENABLED(CONFIG_DEBUG_RSEQ)
+	/* do_rseq_syscall needs interrupts enabled. */
+	enable_irq_notrace			@ enable interrupts
+	mov	r0, sp				@ 'regs'
+	bl	do_rseq_syscall
+#endif
+	disable_irq_notrace			@ disable interrupts
+/*
+ * Second entry point: skips the rseq check and the interrupt disable
+ * above and goes straight to the pending-work test.
+ */
+ENTRY(ret_to_user_from_irq)
+	ldr	r1, [tsk, #TI_FLAGS]
+	movs	r1, r1, lsl #16			@ Z clear if any of flag bits 0..15 is set
+	bne	slow_work_pending
+no_work_pending:
+	asm_trace_hardirqs_on save = 0
+
+	ct_user_enter save = 0
+
+	restore_user_regs fast = 0, offset = 0
+ENDPROC(ret_to_user_from_irq)
+ENDPROC(ret_to_user)
+
+/*
+ * This is how we return from a fork.
+ *
+ * After schedule_tail, a non-zero r5 is called as a function with r4 as
+ * its argument and a return address of 1f; with r5 == 0 execution goes
+ * directly to 1f. Either way the task then leaves through
+ * ret_slow_syscall. (Presumably r4/r5 are set up by the thread creation
+ * code - confirm against the caller.)
+ */
+ENTRY(ret_from_fork)
+	bl	schedule_tail
+	cmp	r5, #0
+	movne	r0, r4
+	badrne	lr, 1f
+	retne	r5
+1:	get_thread_info tsk
+	b	ret_slow_syscall
+ENDPROC(ret_from_fork)
+
+/*=============================================================================
+ * SWI handler
+ *-----------------------------------------------------------------------------
+ */
+
+	.align	5
+#ifdef CONFIG_HARDEN_BRANCH_HISTORY
+/*
+ * Alternative SWI entry: allocates the register frame and saves r0 - r12
+ * exactly as vector_swi does, runs a loop of eight taken branches followed
+ * by dsb/isb, then joins vector_swi at its local label 3.
+ */
+ENTRY(vector_bhb_loop8_swi)
+	sub	sp, sp, #PT_REGS_SIZE
+	stmia	sp, {r0 - r12}
+	mov	r8, #8				@ loop counter (r8 was saved above)
+1:	b	2f
+2:	subs	r8, r8, #1
+	bne	1b
+	dsb	nsh
+	isb
+	b	3f				@ label 3 inside vector_swi
+ENDPROC(vector_bhb_loop8_swi)
+
+	.align	5
+/*
+ * Alternative SWI entry: same frame setup as above, then a BPIALL
+ * operation followed by isb before joining vector_swi at label 3.
+ */
+ENTRY(vector_bhb_bpiall_swi)
+	sub	sp, sp, #PT_REGS_SIZE
+	stmia	sp, {r0 - r12}
+	mcr	p15, 0, r8, c7, c5, 6		@ BPIALL
+	isb
+	b	3f				@ label 3 inside vector_swi
+ENDPROC(vector_bhb_bpiall_swi)
+#endif
+	.align	5
+/*
+ * SWI (system call) entry. Builds the saved-register frame on the stack,
+ * works out the system call number (scno) and the table to use (tbl),
+ * records the number in TI_ABI_SYSCALL and dispatches via invoke_syscall,
+ * or via __sys_trace when any _TIF_SYSCALL_WORK flag is set.
+ */
+ENTRY(vector_swi)
+#ifdef CONFIG_CPU_V7M
+	v7m_exception_entry
+#else
+	sub	sp, sp, #PT_REGS_SIZE
+	stmia	sp, {r0 - r12}			@ Calling r0 - r12
+3:						@ the vector_bhb_*_swi entries join here
+ ARM(	add	r8, sp, #S_PC		)
+ ARM(	stmdb	r8, {sp, lr}^		)	@ Calling sp, lr
+ THUMB(	mov	r8, sp			)
+ THUMB(	store_user_sp_lr r8, r10, S_SP	)	@ calling sp, lr
+	mrs	saved_psr, spsr			@ called from non-FIQ mode, so ok.
+ TRACE(	mov	saved_pc, lr		)
+	str	saved_pc, [sp, #S_PC]		@ Save calling PC
+	str	saved_psr, [sp, #S_PSR]		@ Save CPSR
+	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
+#endif
+	reload_current r10, ip
+	zero_fp
+	alignment_trap r10, ip, cr_alignment
+	asm_trace_hardirqs_on save=0
+	enable_irq_notrace
+	ct_user_exit save=0
+
+	/*
+	 * Get the system call number.
+	 */
+
+#if defined(CONFIG_OABI_COMPAT)
+
+	/*
+	 * If we have CONFIG_OABI_COMPAT then we need to look at the swi
+	 * value to determine if it is an EABI or an old ABI call.
+	 */
+#ifdef CONFIG_ARM_THUMB
+	tst	saved_psr, #PSR_T_BIT
+	movne	r10, #0				@ no thumb OABI emulation
+ USER(	ldreq	r10, [saved_pc, #-4]	)	@ get SWI instruction
+#else
+ USER(	ldr	r10, [saved_pc, #-4]	)	@ get SWI instruction
+#endif
+ ARM_BE8(rev	r10, r10)			@ little endian instruction
+
+#elif defined(CONFIG_AEABI)
+
+	/*
+	 * Pure EABI user space always put syscall number into scno (r7).
+	 */
+#elif defined(CONFIG_ARM_THUMB)
+	/* Legacy ABI only, possibly thumb mode. */
+	tst	saved_psr, #PSR_T_BIT		@ this is SPSR from save_user_regs
+	addne	scno, r7, #__NR_SYSCALL_BASE	@ put OS number in
+ USER(	ldreq	scno, [saved_pc, #-4]	)
+
+#else
+	/* Legacy ABI only. */
+ USER(	ldr	scno, [saved_pc, #-4]	)	@ get SWI instruction
+#endif
+
+	/* saved_psr and saved_pc are now dead */
+
+	uaccess_disable tbl
+	get_thread_info tsk
+
+	adr	tbl, sys_call_table		@ load syscall table pointer
+
+#if defined(CONFIG_OABI_COMPAT)
+	/*
+	 * If the swi argument is zero, this is an EABI call and we do nothing.
+	 *
+	 * If this is an old ABI call, get the syscall number into scno and
+	 * get the old ABI syscall table address.
+	 */
+	bics	r10, r10, #0xff000000		@ Z set if the swi argument is zero
+	strne	r10, [tsk, #TI_ABI_SYSCALL]	@ old ABI: record the swi argument
+	streq	scno, [tsk, #TI_ABI_SYSCALL]	@ EABI: record scno
+	eorne	scno, r10, #__NR_OABI_SYSCALL_BASE
+	ldrne	tbl, =sys_oabi_call_table
+#elif !defined(CONFIG_AEABI)
+	bic	scno, scno, #0xff000000		@ mask off SWI op-code
+	str	scno, [tsk, #TI_ABI_SYSCALL]
+	eor	scno, scno, #__NR_SYSCALL_BASE	@ check OS number
+#else
+	str	scno, [tsk, #TI_ABI_SYSCALL]
+#endif
+	/*
+	 * Reload the registers that may have been corrupted on entry to
+	 * the syscall assembly (by tracing or context tracking.)
+	 */
+ TRACE(	ldmia	sp, {r0 - r3}		)
+
+	/*
+	 * Also the re-entry point used by slow_work_pending when a system
+	 * call is to be issued again; note that it lies after the
+	 * TI_ABI_SYSCALL stores above.
+	 */
+local_restart:
+	ldr	r10, [tsk, #TI_FLAGS]		@ check for syscall tracing
+	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
+
+	tst	r10, #_TIF_SYSCALL_WORK		@ are we tracing syscalls?
+	bne	__sys_trace
+
+	invoke_syscall tbl, scno, r10, __ret_fast_syscall
+
+	/*
+	 * Reached when invoke_syscall falls through (presumably because
+	 * scno is outside the table - confirm against the macro). Label 2
+	 * is also the target of "bne 2b" in __sys_trace.
+	 */
+	add	r1, sp, #S_OFF
+2:	cmp	scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
+	eor	r0, scno, #__NR_SYSCALL_BASE	@ put OS number back
+	bcs	arm_syscall
+	mov	why, #0				@ no longer a real syscall
+	b	sys_ni_syscall			@ not private func
+
+#if defined(CONFIG_OABI_COMPAT) || !defined(CONFIG_AEABI)
+	/*
+	 * We failed to handle a fault trying to access the page
+	 * containing the swi instruction, but we're not really in a
+	 * position to return -EFAULT. Instead, return back to the
+	 * instruction and re-enter the user fault handling path trying
+	 * to page it in. This will likely result in sending SEGV to the
+	 * current task.
+	 */
+9001:
+	sub	lr, saved_pc, #4		@ address of the swi instruction itself
+	str	lr, [sp, #S_PC]			@ becomes the saved user PC
+	get_thread_info tsk
+	b	ret_fast_syscall
+#endif
+ENDPROC(vector_swi)
+	.ltorg
+
+	/*
+	 * This is the really slow path. We're going to be doing
+	 * context switches, and waiting for our parent to respond.
+	 *
+	 * The value returned by syscall_trace_enter becomes the new scno;
+	 * -1 means the system call itself is skipped.
+	 */
+__sys_trace:
+	add	r0, sp, #S_OFF
+	bl	syscall_trace_enter
+	mov	scno, r0
+	invoke_syscall tbl, scno, r10, __sys_trace_return, reload=1
+	cmp	scno, #-1			@ skip the syscall?
+	bne	2b
+	add	sp, sp, #S_OFF			@ restore stack
+
+	/* Exit tracing without storing r0 into the frame first. */
+__sys_trace_return_nosave:
+	enable_irq_notrace
+	mov	r0, sp
+	bl	syscall_trace_exit
+	b	ret_slow_syscall
+
+	/* Return address handed to invoke_syscall by __sys_trace. */
+__sys_trace_return:
+	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
+	mov	r0, sp
+	bl	syscall_trace_exit
+	b	ret_slow_syscall
+
+	/*
+	 * syscall_table_start sym: open a table named \sym and reset the
+	 * running entry counter __sys_nr to zero.
+	 */
+	.macro	syscall_table_start, sym
+	.equ	__sys_nr, 0
+	.type	\sym, #object
+ENTRY(\sym)
+	.endm
+
+	/*
+	 * syscall nr, func: emit \func as entry number \nr. Any gap since
+	 * the previous entry is filled with sys_ni_syscall; an \nr lower
+	 * than the running counter is an assembly error.
+	 */
+	.macro	syscall, nr, func
+	.ifgt	__sys_nr - \nr
+	.error	"Duplicated/unorded system call entry"
+	.endif
+	.rept	\nr - __sys_nr
+	.long	sys_ni_syscall
+	.endr
+	.long	\func
+	.equ	__sys_nr, \nr + 1
+	.endm
+
+	/*
+	 * syscall_table_end sym: pad the table with sys_ni_syscall up to
+	 * __NR_syscalls entries (error if already larger) and set the size
+	 * of \sym.
+	 */
+	.macro	syscall_table_end, sym
+	.ifgt	__sys_nr - __NR_syscalls
+	.error	"System call table too big"
+	.endif
+	.rept	__NR_syscalls - __sys_nr
+	.long	sys_ni_syscall
+	.endr
+	.size	\sym, . - \sym
+	.endm
+
+/* For the native table the 'compat' handler is ignored. */
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, native)
+#define __SYSCALL(nr, func)		syscall nr, func
+
+/*
+ * This is the syscall table declaration for native ABI syscalls.
+ * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall.
+ */
+	syscall_table_start sys_call_table
+/*
+ * NOTE(review): the operands of the two #include lines below are missing
+ * in this copy; presumably they name the generated EABI / OABI call lists
+ * that expand to __SYSCALL() entries - restore them from the original file.
+ */
+#ifdef CONFIG_AEABI
+#include
+#else
+#include
+#endif
+	syscall_table_end sys_call_table
+
+/*============================================================================
+ * Special system call wrappers
+ */
+/*
+ * Indirect system call: the real number arrives in r0 and the real
+ * arguments follow in r1 onwards. A number equal to __NR_syscall itself,
+ * or one that is not below NR_syscalls, ends up in sys_ni_syscall;
+ * otherwise the arguments are moved down by one position and the handler
+ * is entered through the table in tbl.
+ */
+@ r0 = syscall number
+@ r8 = syscall table
+sys_syscall:
+	bic	scno, r0, #__NR_OABI_SYSCALL_BASE
+	cmp	scno, #__NR_syscall - __NR_SYSCALL_BASE	@ equal leaves C set, so 'lo' is false
+	cmpne	scno, #NR_syscalls	@ check range
+#ifdef CONFIG_CPU_SPECTRE
+	movhs	scno, #0		@ rejected number: force the index to zero
+	csdb
+#endif
+	stmialo	sp, {r5, r6}		@ shuffle args
+	movlo	r0, r1
+	movlo	r1, r2
+	movlo	r2, r3
+	movlo	r3, r4
+	ldrlo	pc, [tbl, scno, lsl #2]	@ jump to table entry number scno
+	b	sys_ni_syscall		@ only reached when the number was rejected
+ENDPROC(sys_syscall)
+
+/* Calls sys_sigreturn with r0 = sp + S_OFF and why cleared. */
+sys_sigreturn_wrapper:
+	add	r0, sp, #S_OFF
+	mov	why, #0		@ prevent syscall restart handling
+	b	sys_sigreturn
+ENDPROC(sys_sigreturn_wrapper)
+
+/* Calls sys_rt_sigreturn with r0 = sp + S_OFF and why cleared. */
+sys_rt_sigreturn_wrapper:
+	add	r0, sp, #S_OFF
+	mov	why, #0		@ prevent syscall restart handling
+	b	sys_rt_sigreturn
+ENDPROC(sys_rt_sigreturn_wrapper)
+
+/*
+ * Replaces a second argument of 88 with 84 before calling sys_statfs64.
+ * (Presumably two sizes of the same structure under different ABI
+ * padding rules - confirm.)
+ */
+sys_statfs64_wrapper:
+	teq	r1, #88
+	moveq	r1, #84
+	b	sys_statfs64
+ENDPROC(sys_statfs64_wrapper)
+
+/* Same adjustment of the second argument, for sys_fstatfs64. */
+sys_fstatfs64_wrapper:
+	teq	r1, #88
+	moveq	r1, #84
+	b	sys_fstatfs64
+ENDPROC(sys_fstatfs64_wrapper)
+
+/*
+ * Note: off_4k (r5) is always units of 4K. If we can't do the requested
+ * offset, we return EINVAL.
+ */
+sys_mmap2:
+	str	r5, [sp, #4]		@ r5 (off_4k) into the second argument slot on the stack
+	b	sys_mmap_pgoff
+ENDPROC(sys_mmap2)
+
+#ifdef CONFIG_OABI_COMPAT
+
+/*
+ * These are syscalls with argument register differences
+ *
+ * Each wrapper only moves arguments between registers and the two stack
+ * argument slots and then branches to the native handler. (Presumably
+ * this repositions a 64-bit argument to where the native handler expects
+ * it - confirm.)
+ */
+
+sys_oabi_pread64:
+	stmia	sp, {r3, r4}		@ r3, r4 into the two stack argument slots
+	b	sys_pread64
+ENDPROC(sys_oabi_pread64)
+
+sys_oabi_pwrite64:
+	stmia	sp, {r3, r4}		@ r3, r4 into the two stack argument slots
+	b	sys_pwrite64
+ENDPROC(sys_oabi_pwrite64)
+
+sys_oabi_truncate64:
+	mov	r3, r2			@ move the r1/r2 pair up to r2/r3
+	mov	r2, r1
+	b	sys_truncate64
+ENDPROC(sys_oabi_truncate64)
+
+sys_oabi_ftruncate64:
+	mov	r3, r2			@ move the r1/r2 pair up to r2/r3
+	mov	r2, r1
+	b	sys_ftruncate64
+ENDPROC(sys_oabi_ftruncate64)
+
+sys_oabi_readahead:
+	str	r3, [sp]		@ r3 into the first stack argument slot
+	mov	r3, r2			@ move the r1/r2 pair up to r2/r3
+	mov	r2, r1
+	b	sys_readahead
+ENDPROC(sys_oabi_readahead)
+
+/*
+ * Let's declare a second syscall table for old ABI binaries
+ * using the compatibility syscall entries.
+ */
+	syscall_table_start sys_oabi_call_table
+/* From here on __SYSCALL_WITH_COMPAT selects the 'compat' handler. */
+#undef __SYSCALL_WITH_COMPAT
+#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, compat)
+/*
+ * NOTE(review): the operand of the #include below is missing in this
+ * copy; presumably it names the generated OABI call list - restore it
+ * from the original file.
+ */
+#include
+	syscall_table_end sys_oabi_call_table
+
+#endif
+
-- cgit v1.2.3