aboutsummaryrefslogtreecommitdiff
path: root/tools/testing/selftests/kvm/lib/memstress.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /tools/testing/selftests/kvm/lib/memstress.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'tools/testing/selftests/kvm/lib/memstress.c')
-rw-r--r--tools/testing/selftests/kvm/lib/memstress.c322
1 files changed, 322 insertions, 0 deletions
diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c
new file mode 100644
index 000000000..5f1d3173c
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020, Google LLC.
+ */
+#define _GNU_SOURCE
+
+#include <inttypes.h>
+
+#include "kvm_util.h"
+#include "memstress.h"
+#include "processor.h"
+
+struct memstress_args memstress_args;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+struct vcpu_thread {
+ /* The index of the vCPU. */
+ int vcpu_idx;
+
+ /* The pthread backing the vCPU. */
+ pthread_t thread;
+
+ /* Set to true once the vCPU thread is up and running. */
+ bool running;
+};
+
+/* The vCPU threads involved in this test. */
+static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
+
+/* The function run by each vCPU thread, as provided by the test. */
+static void (*vcpu_thread_fn)(struct memstress_vcpu_args *);
+
+/* Set to true once all vCPU threads are up and running. */
+static bool all_vcpu_threads_running;
+
+static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+/*
+ * Continuously write to the first 8 bytes of each page in the
+ * specified region.
+ */
+void memstress_guest_code(uint32_t vcpu_idx)
+{
+ struct memstress_args *args = &memstress_args;
+ struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx];
+ struct guest_random_state rand_state;
+ uint64_t gva;
+ uint64_t pages;
+ uint64_t addr;
+ uint64_t page;
+ int i;
+
+ rand_state = new_guest_random_state(args->random_seed + vcpu_idx);
+
+ gva = vcpu_args->gva;
+ pages = vcpu_args->pages;
+
+ /* Make sure vCPU args data structure is not corrupt. */
+ GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx);
+
+ while (true) {
+ for (i = 0; i < pages; i++) {
+ if (args->random_access)
+ page = guest_random_u32(&rand_state) % pages;
+ else
+ page = i;
+
+ addr = gva + (page * args->guest_page_size);
+
+ if (guest_random_u32(&rand_state) % 100 < args->write_percent)
+ *(uint64_t *)addr = 0x0123456789ABCDEF;
+ else
+ READ_ONCE(*(uint64_t *)addr);
+ }
+
+ GUEST_SYNC(1);
+ }
+}
+
+void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
+ struct kvm_vcpu *vcpus[],
+ uint64_t vcpu_memory_bytes,
+ bool partition_vcpu_memory_access)
+{
+ struct memstress_args *args = &memstress_args;
+ struct memstress_vcpu_args *vcpu_args;
+ int i;
+
+ for (i = 0; i < nr_vcpus; i++) {
+ vcpu_args = &args->vcpu_args[i];
+
+ vcpu_args->vcpu = vcpus[i];
+ vcpu_args->vcpu_idx = i;
+
+ if (partition_vcpu_memory_access) {
+ vcpu_args->gva = guest_test_virt_mem +
+ (i * vcpu_memory_bytes);
+ vcpu_args->pages = vcpu_memory_bytes /
+ args->guest_page_size;
+ vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes);
+ } else {
+ vcpu_args->gva = guest_test_virt_mem;
+ vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) /
+ args->guest_page_size;
+ vcpu_args->gpa = args->gpa;
+ }
+
+ vcpu_args_set(vcpus[i], 1, i);
+
+ pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+ i, vcpu_args->gpa, vcpu_args->gpa +
+ (vcpu_args->pages * args->guest_page_size));
+ }
+}
+
+struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
+ uint64_t vcpu_memory_bytes, int slots,
+ enum vm_mem_backing_src_type backing_src,
+ bool partition_vcpu_memory_access)
+{
+ struct memstress_args *args = &memstress_args;
+ struct kvm_vm *vm;
+ uint64_t guest_num_pages, slot0_pages = 0;
+ uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
+ uint64_t region_end_gfn;
+ int i;
+
+ pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+ /* By default vCPUs will write to memory. */
+ args->write_percent = 100;
+
+ /*
+ * Snapshot the non-huge page size. This is used by the guest code to
+ * access/dirty pages at the logging granularity.
+ */
+ args->guest_page_size = vm_guest_mode_params[mode].page_size;
+
+ guest_num_pages = vm_adjust_num_guest_pages(mode,
+ (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size);
+
+ TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0,
+ "Guest memory size is not host page size aligned.");
+ TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0,
+ "Guest memory size is not guest page size aligned.");
+ TEST_ASSERT(guest_num_pages % slots == 0,
+ "Guest memory cannot be evenly divided into %d slots.",
+ slots);
+
+ /*
+ * If using nested, allocate extra pages for the nested page tables and
+ * in-memory data structures.
+ */
+ if (args->nested)
+ slot0_pages += memstress_nested_pages(nr_vcpus);
+
+ /*
+ * Pass guest_num_pages to populate the page tables for test memory.
+ * The memory is also added to memslot 0, but that's a benign side
+ * effect as KVM allows aliasing HVAs in meslots.
+ */
+ vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages,
+ memstress_guest_code, vcpus);
+
+ args->vm = vm;
+
+ /* Put the test region at the top guest physical memory. */
+ region_end_gfn = vm->max_gfn + 1;
+
+#ifdef __x86_64__
+ /*
+ * When running vCPUs in L2, restrict the test region to 48 bits to
+ * avoid needing 5-level page tables to identity map L2.
+ */
+ if (args->nested)
+ region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size);
+#endif
+ /*
+ * If there should be more memory in the guest test region than there
+ * can be pages in the guest, it will definitely cause problems.
+ */
+ TEST_ASSERT(guest_num_pages < region_end_gfn,
+ "Requested more guest memory than address space allows.\n"
+ " guest pages: %" PRIx64 " max gfn: %" PRIx64
+ " nr_vcpus: %d wss: %" PRIx64 "]\n",
+ guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes);
+
+ args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size;
+ args->gpa = align_down(args->gpa, backing_src_pagesz);
+#ifdef __s390x__
+ /* Align to 1M (segment size) */
+ args->gpa = align_down(args->gpa, 1 << 20);
+#endif
+ args->size = guest_num_pages * args->guest_page_size;
+ pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
+ args->gpa, args->gpa + args->size);
+
+ /* Add extra memory slots for testing */
+ for (i = 0; i < slots; i++) {
+ uint64_t region_pages = guest_num_pages / slots;
+ vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i;
+
+ vm_userspace_mem_region_add(vm, backing_src, region_start,
+ MEMSTRESS_MEM_SLOT_INDEX + i,
+ region_pages, 0);
+ }
+
+ /* Do mapping for the demand paging memory slot */
+ virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages);
+
+ memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes,
+ partition_vcpu_memory_access);
+
+ if (args->nested) {
+ pr_info("Configuring vCPUs to run in L2 (nested).\n");
+ memstress_setup_nested(vm, nr_vcpus, vcpus);
+ }
+
+ /* Export the shared variables to the guest. */
+ sync_global_to_guest(vm, memstress_args);
+
+ return vm;
+}
+
+void memstress_destroy_vm(struct kvm_vm *vm)
+{
+ kvm_vm_free(vm);
+}
+
+void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent)
+{
+ memstress_args.write_percent = write_percent;
+ sync_global_to_guest(vm, memstress_args.write_percent);
+}
+
+void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed)
+{
+ memstress_args.random_seed = random_seed;
+ sync_global_to_guest(vm, memstress_args.random_seed);
+}
+
+void memstress_set_random_access(struct kvm_vm *vm, bool random_access)
+{
+ memstress_args.random_access = random_access;
+ sync_global_to_guest(vm, memstress_args.random_access);
+}
+
+uint64_t __weak memstress_nested_pages(int nr_vcpus)
+{
+ return 0;
+}
+
+void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus)
+{
+ pr_info("%s() not support on this architecture, skipping.\n", __func__);
+ exit(KSFT_SKIP);
+}
+
+static void *vcpu_thread_main(void *data)
+{
+ struct vcpu_thread *vcpu = data;
+ int vcpu_idx = vcpu->vcpu_idx;
+
+ if (memstress_args.pin_vcpus)
+ kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]);
+
+ WRITE_ONCE(vcpu->running, true);
+
+ /*
+ * Wait for all vCPU threads to be up and running before calling the test-
+ * provided vCPU thread function. This prevents thread creation (which
+ * requires taking the mmap_sem in write mode) from interfering with the
+ * guest faulting in its memory.
+ */
+ while (!READ_ONCE(all_vcpu_threads_running))
+ ;
+
+ vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]);
+
+ return NULL;
+}
+
+void memstress_start_vcpu_threads(int nr_vcpus,
+ void (*vcpu_fn)(struct memstress_vcpu_args *))
+{
+ int i;
+
+ vcpu_thread_fn = vcpu_fn;
+ WRITE_ONCE(all_vcpu_threads_running, false);
+ WRITE_ONCE(memstress_args.stop_vcpus, false);
+
+ for (i = 0; i < nr_vcpus; i++) {
+ struct vcpu_thread *vcpu = &vcpu_threads[i];
+
+ vcpu->vcpu_idx = i;
+ WRITE_ONCE(vcpu->running, false);
+
+ pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
+ }
+
+ for (i = 0; i < nr_vcpus; i++) {
+ while (!READ_ONCE(vcpu_threads[i].running))
+ ;
+ }
+
+ WRITE_ONCE(all_vcpu_threads_running, true);
+}
+
+void memstress_join_vcpu_threads(int nr_vcpus)
+{
+ int i;
+
+ WRITE_ONCE(memstress_args.stop_vcpus, true);
+
+ for (i = 0; i < nr_vcpus; i++)
+ pthread_join(vcpu_threads[i].thread, NULL);
+}