aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/idt.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /arch/x86/kernel/idt.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to '')
-rw-r--r--arch/x86/kernel/idt.c344
1 files changed, 344 insertions, 0 deletions
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000..a58c6bc1c
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interrupt descriptor table related code
+ */
+#include <linux/interrupt.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include <asm/idtentry.h>
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+#ifdef CONFIG_X86_64
+/*
+ * Interrupt gate with interrupt stack. The _ist index is the index in
+ * the tss.ist[] array, but for the descriptor it needs to start at 1.
+ */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
+
+static bool idt_setup_done __initdata;
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, asm_exc_debug),
+ SYSG(X86_TRAP_BP, asm_exc_int3),
+
+#ifdef CONFIG_X86_32
+ /*
+ * Not possible on 64-bit. See idt_setup_early_pf() for details.
+ */
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initconst struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, asm_exc_divide_error),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ INTG(X86_TRAP_BR, asm_exc_bounds),
+ INTG(X86_TRAP_UD, asm_exc_invalid_op),
+ INTG(X86_TRAP_NM, asm_exc_device_not_available),
+ INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
+ INTG(X86_TRAP_TS, asm_exc_invalid_tss),
+ INTG(X86_TRAP_NP, asm_exc_segment_not_present),
+ INTG(X86_TRAP_SS, asm_exc_stack_segment),
+ INTG(X86_TRAP_GP, asm_exc_general_protection),
+ INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
+ INTG(X86_TRAP_AC, asm_exc_alignment_check),
+ INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+#endif
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+ INTG(X86_TRAP_CP, asm_exc_control_protection),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
+#endif
+
+ SYSG(X86_TRAP_OF, asm_exc_overflow),
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initconst struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
+ INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup),
+ INTG(REBOOT_VECTOR, asm_sysvec_reboot),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
+ INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+#endif
+};
+
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+static struct desc_ptr idt_descr __ro_after_init = {
+ .size = IDT_TABLE_SIZE - 1,
+ .address = (unsigned long) idt_table,
+};
+
+void load_current_idt(void)
+{
+ lockdep_assert_irqs_disabled();
+ load_idt(&idt_descr);
+}
+
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+ return ((address - idt_descr.address) >> 3) == 6;
+}
+#endif
+
+static __init void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, system_vectors);
+ }
+}
+
+static __init void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ init_idt_data(&data, n, addr);
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X8664 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+};
+
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X8664 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * Note, that X86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+#endif
+
+static void __init idt_map_in_cea(void)
+{
+ /*
+ * Set the IDT descriptor to a fixed read-only location in the cpu
+ * entry area, so that the "sidt" instruction will not leak the
+ * location of the kernel, and to defend the IDT against arbitrary
+ * memory write vulnerabilities.
+ */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+}
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
+ /*
+ * Don't set the non assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+ entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR);
+ set_intr_gate(i, entry);
+ }
+#endif
+ /* Map IDT into CPU entry area and reload it. */
+ idt_map_in_cea();
+ load_idt(&idt_descr);
+
+ /* Make the IDT table read only */
+ set_memory_ro((unsigned long)&idt_table, 1);
+
+ idt_setup_done = true;
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ */
+void idt_invalidate(void)
+{
+ static const struct desc_ptr idt = { .address = 0, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init alloc_intr_gate(unsigned int n, const void *addr)
+{
+ if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
+ return;
+
+ if (WARN_ON(idt_setup_done))
+ return;
+
+ if (!WARN_ON(test_and_set_bit(n, system_vectors)))
+ set_intr_gate(n, addr);
+}