aboutsummaryrefslogtreecommitdiff
path: root/kernel/trace/tracing_map.h
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /kernel/trace/tracing_map.h
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'kernel/trace/tracing_map.h')
-rw-r--r--kernel/trace/tracing_map.h288
1 files changed, 288 insertions, 0 deletions
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
new file mode 100644
index 000000000..2c765ee2a
--- /dev/null
+++ b/kernel/trace/tracing_map.h
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACING_MAP_H
+#define __TRACING_MAP_H
+
+#define TRACING_MAP_BITS_DEFAULT 11
+#define TRACING_MAP_BITS_MAX 17
+#define TRACING_MAP_BITS_MIN 7
+
+#define TRACING_MAP_KEYS_MAX 3
+#define TRACING_MAP_VALS_MAX 3
+#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
+ TRACING_MAP_VALS_MAX)
+#define TRACING_MAP_VARS_MAX 16
+#define TRACING_MAP_SORT_KEYS_MAX 2
+
+typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
+
+/*
+ * This is an overview of the tracing_map data structures and how they
+ * relate to the tracing_map API. The details of the algorithms
+ * aren't discussed here - this is just a general overview of the data
+ * structures and how they interact with the API.
+ *
+ * The central data structure of the tracing_map is an initially
+ * zeroed array of struct tracing_map_entry (stored in the map field
+ * of struct tracing_map). tracing_map_entry is a very simple data
+ * structure containing only two fields: a 32-bit unsigned 'key'
+ * variable and a pointer named 'val'. This array of struct
+ * tracing_map_entry is essentially a hash table which will be
+ * modified by a single function, tracing_map_insert(), but which can
+ * be traversed and read by a user at any time (though the user does
+ * this indirectly via an array of tracing_map_sort_entry - see the
+ * explanation of that data structure in the discussion of the
+ * sorting-related data structures below).
+ *
+ * The central function of the tracing_map API is
+ * tracing_map_insert(). tracing_map_insert() hashes the
+ * arbitrarily-sized key passed into it into a 32-bit unsigned key.
+ * It then uses this key, truncated to the array size, as an index
+ * into the array of tracing_map_entries. If the value of the 'key'
+ * field of the tracing_map_entry found at that location is 0, then
+ * that entry is considered to be free and can be claimed, by
+ * replacing the 0 in the 'key' field of the tracing_map_entry with
+ * the new 32-bit hashed key. Once claimed, that tracing_map_entry's
+ * 'val' field is then used to store a unique element which will be
+ * forever associated with that 32-bit hashed key in the
+ * tracing_map_entry.
+ *
+ * That unique element now in the tracing_map_entry's 'val' field is
+ * an instance of tracing_map_elt, where 'elt' in the latter part of
+ * that variable name is short for 'element'. The purpose of a
+ * tracing_map_elt is to hold values specific to the particular
+ * 32-bit hashed key it's associated with. Things such as the unique
+ * set of aggregated sums associated with the 32-bit hashed key, along
+ * with a copy of the full key associated with the entry, and which
+ * was used to produce the 32-bit hashed key.
+ *
+ * When tracing_map_create() is called to create the tracing map, the
+ * user specifies (indirectly via the map_bits param, the details are
+ * unimportant for this discussion) the maximum number of elements
+ * that the map can hold (stored in the max_elts field of struct
+ * tracing_map). This is the maximum possible number of
+ * tracing_map_entries in the tracing_map_entry array which can be
+ * 'claimed' as described in the above discussion, and therefore is
+ * also the maximum number of tracing_map_elts that can be associated
+ * with the tracing_map_entry array in the tracing_map. Because of
+ * the way the insertion algorithm works, the size of the allocated
+ * tracing_map_entry array is always twice the maximum number of
+ * elements (2 * max_elts). This value is stored in the map_size
+ * field of struct tracing_map.
+ *
+ * Because tracing_map_insert() needs to work from any context,
+ * including from within the memory allocation functions themselves,
+ * both the tracing_map_entry array and a pool of max_elts
+ * tracing_map_elts are pre-allocated before any call is made to
+ * tracing_map_insert().
+ *
+ * The tracing_map_entry array is allocated as a single block by
+ * tracing_map_create().
+ *
+ * Because the tracing_map_elts are much larger objects and can't
+ * generally be allocated together as a single large array without
+ * failure, they're allocated individually, by tracing_map_init().
+ *
+ * The pool of tracing_map_elts are allocated by tracing_map_init()
+ * rather than by tracing_map_create() because at the time
+ * tracing_map_create() is called, there isn't enough information to
+ * create the tracing_map_elts. Specifically,the user first needs to
+ * tell the tracing_map implementation how many fields the
+ * tracing_map_elts contain, and which types of fields they are (key
+ * or sum). The user does this via the tracing_map_add_sum_field()
+ * and tracing_map_add_key_field() functions, following which the user
+ * calls tracing_map_init() to finish up the tracing map setup. The
+ * array holding the pointers which make up the pre-allocated pool of
+ * tracing_map_elts is allocated as a single block and is stored in
+ * the elts field of struct tracing_map.
+ *
+ * There is also a set of structures used for sorting that might
+ * benefit from some minimal explanation.
+ *
+ * struct tracing_map_sort_key is used to drive the sort at any given
+ * time. By 'any given time' we mean that a different
+ * tracing_map_sort_key will be used at different times depending on
+ * whether the sort currently being performed is a primary or a
+ * secondary sort.
+ *
+ * The sort key is very simple, consisting of the field index of the
+ * tracing_map_elt field to sort on (which the user saved when adding
+ * the field), and whether the sort should be done in an ascending or
+ * descending order.
+ *
+ * For the convenience of the sorting code, a tracing_map_sort_entry
+ * is created for each tracing_map_elt, again individually allocated
+ * to avoid failures that might be expected if allocated as a single
+ * large array of struct tracing_map_sort_entry.
+ * tracing_map_sort_entry instances are the objects expected by the
+ * various internal sorting functions, and are also what the user
+ * ultimately receives after calling tracing_map_sort_entries().
+ * Because it doesn't make sense for users to access an unordered and
+ * sparsely populated tracing_map directly, the
+ * tracing_map_sort_entries() function is provided so that users can
+ * retrieve a sorted list of all existing elements. In addition to
+ * the associated tracing_map_elt 'elt' field contained within the
+ * tracing_map_sort_entry, which is the object of interest to the
+ * user, tracing_map_sort_entry objects contain a number of additional
+ * fields which are used for caching and internal purposes and can
+ * safely be ignored.
+*/
+
+struct tracing_map_field {
+ tracing_map_cmp_fn_t cmp_fn;
+ union {
+ atomic64_t sum;
+ unsigned int offset;
+ };
+};
+
+struct tracing_map_elt {
+ struct tracing_map *map;
+ struct tracing_map_field *fields;
+ atomic64_t *vars;
+ bool *var_set;
+ void *key;
+ void *private_data;
+};
+
+struct tracing_map_entry {
+ u32 key;
+ struct tracing_map_elt *val;
+};
+
+struct tracing_map_sort_key {
+ unsigned int field_idx;
+ bool descending;
+};
+
+struct tracing_map_sort_entry {
+ void *key;
+ struct tracing_map_elt *elt;
+ bool elt_copied;
+ bool dup;
+};
+
+struct tracing_map_array {
+ unsigned int entries_per_page;
+ unsigned int entry_size_shift;
+ unsigned int entry_shift;
+ unsigned int entry_mask;
+ unsigned int n_pages;
+ void **pages;
+};
+
+#define TRACING_MAP_ARRAY_ELT(array, idx) \
+ (array->pages[idx >> array->entry_shift] + \
+ ((idx & array->entry_mask) << array->entry_size_shift))
+
+#define TRACING_MAP_ENTRY(array, idx) \
+ ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx))
+
+#define TRACING_MAP_ELT(array, idx) \
+ ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx))
+
+struct tracing_map {
+ unsigned int key_size;
+ unsigned int map_bits;
+ unsigned int map_size;
+ unsigned int max_elts;
+ atomic_t next_elt;
+ struct tracing_map_array *elts;
+ struct tracing_map_array *map;
+ const struct tracing_map_ops *ops;
+ void *private_data;
+ struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX];
+ unsigned int n_fields;
+ int key_idx[TRACING_MAP_KEYS_MAX];
+ unsigned int n_keys;
+ struct tracing_map_sort_key sort_key;
+ unsigned int n_vars;
+ atomic64_t hits;
+ atomic64_t drops;
+};
+
+/**
+ * struct tracing_map_ops - callbacks for tracing_map
+ *
+ * The methods in this structure define callback functions for various
+ * operations on a tracing_map or objects related to a tracing_map.
+ *
+ * For a detailed description of tracing_map_elt objects please see
+ * the overview of tracing_map data structures at the beginning of
+ * this file.
+ *
+ * All the methods below are optional.
+ *
+ * @elt_alloc: When a tracing_map_elt is allocated, this function, if
+ * defined, will be called and gives clients the opportunity to
+ * allocate additional data and attach it to the element
+ * (tracing_map_elt->private_data is meant for that purpose).
+ * Element allocation occurs before tracing begins, when the
+ * tracing_map_init() call is made by client code.
+ *
+ * @elt_free: When a tracing_map_elt is freed, this function is called
+ * and allows client-allocated per-element data to be freed.
+ *
+ * @elt_clear: This callback allows per-element client-defined data to
+ * be cleared, if applicable.
+ *
+ * @elt_init: This callback allows per-element client-defined data to
+ * be initialized when used i.e. when the element is actually
+ * claimed by tracing_map_insert() in the context of the map
+ * insertion.
+ */
+struct tracing_map_ops {
+ int (*elt_alloc)(struct tracing_map_elt *elt);
+ void (*elt_free)(struct tracing_map_elt *elt);
+ void (*elt_clear)(struct tracing_map_elt *elt);
+ void (*elt_init)(struct tracing_map_elt *elt);
+};
+
+extern struct tracing_map *
+tracing_map_create(unsigned int map_bits,
+ unsigned int key_size,
+ const struct tracing_map_ops *ops,
+ void *private_data);
+extern int tracing_map_init(struct tracing_map *map);
+
+extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_var(struct tracing_map *map);
+extern int tracing_map_add_key_field(struct tracing_map *map,
+ unsigned int offset,
+ tracing_map_cmp_fn_t cmp_fn);
+
+extern void tracing_map_destroy(struct tracing_map *map);
+extern void tracing_map_clear(struct tracing_map *map);
+
+extern struct tracing_map_elt *
+tracing_map_insert(struct tracing_map *map, void *key);
+extern struct tracing_map_elt *
+tracing_map_lookup(struct tracing_map *map, void *key);
+
+extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+ int field_is_signed);
+extern int tracing_map_cmp_string(void *val_a, void *val_b);
+extern int tracing_map_cmp_none(void *val_a, void *val_b);
+
+extern void tracing_map_update_sum(struct tracing_map_elt *elt,
+ unsigned int i, u64 n);
+extern void tracing_map_set_var(struct tracing_map_elt *elt,
+ unsigned int i, u64 n);
+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
+
+extern void tracing_map_set_field_descr(struct tracing_map *map,
+ unsigned int i,
+ unsigned int key_offset,
+ tracing_map_cmp_fn_t cmp_fn);
+extern int
+tracing_map_sort_entries(struct tracing_map *map,
+ struct tracing_map_sort_key *sort_keys,
+ unsigned int n_sort_keys,
+ struct tracing_map_sort_entry ***sort_entries);
+
+extern void
+tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+ unsigned int n_entries);
+#endif /* __TRACING_MAP_H */