aboutsummaryrefslogtreecommitdiff
path: root/lib/btree.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /lib/btree.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to '')
-rw-r--r--lib/btree.c797
1 files changed, 797 insertions, 0 deletions
diff --git a/lib/btree.c b/lib/btree.c
new file mode 100644
index 000000000..a82100c73
--- /dev/null
+++ b/lib/btree.c
@@ -0,0 +1,797 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * lib/btree.c - Simple In-memory B+Tree
+ *
+ * Copyright (c) 2007-2008 Joern Engel <joern@purestorage.com>
+ * Bits and pieces stolen from Peter Zijlstra's code, which is
+ * Copyright 2007, Red Hat Inc. Peter Zijlstra
+ *
+ * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
+ *
+ * A relatively simple B+Tree implementation. I have written it as a learning
+ * exercise to understand how B+Trees work. Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware). Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequisite for a long time. More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased. Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space. Between 25% and 50% of the memory is
+ * occupied with valid pointers. When densely populated, radix trees contain
+ * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * A tricks was used that is not commonly found in textbooks. The lowest
+ * values are to the right, not to the left. All used slots within a node
+ * are on the left, all unused slots contain NUL values. Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ */
+
+#include <linux/btree.h>
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define NODESIZE MAX(L1_CACHE_BYTES, 128)
+
+struct btree_geo {
+ int keylen;
+ int no_pairs;
+ int no_longs;
+};
+
+struct btree_geo btree_geo32 = {
+ .keylen = 1,
+ .no_pairs = NODESIZE / sizeof(long) / 2,
+ .no_longs = NODESIZE / sizeof(long) / 2,
+};
+EXPORT_SYMBOL_GPL(btree_geo32);
+
+#define LONG_PER_U64 (64 / BITS_PER_LONG)
+struct btree_geo btree_geo64 = {
+ .keylen = LONG_PER_U64,
+ .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64),
+ .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)),
+};
+EXPORT_SYMBOL_GPL(btree_geo64);
+
+struct btree_geo btree_geo128 = {
+ .keylen = 2 * LONG_PER_U64,
+ .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64),
+ .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)),
+};
+EXPORT_SYMBOL_GPL(btree_geo128);
+
+#define MAX_KEYLEN (2 * LONG_PER_U64)
+
+static struct kmem_cache *btree_cachep;
+
+void *btree_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ return kmem_cache_alloc(btree_cachep, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(btree_alloc);
+
+void btree_free(void *element, void *pool_data)
+{
+ kmem_cache_free(btree_cachep, element);
+}
+EXPORT_SYMBOL_GPL(btree_free);
+
+static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp)
+{
+ unsigned long *node;
+
+ node = mempool_alloc(head->mempool, gfp);
+ if (likely(node))
+ memset(node, 0, NODESIZE);
+ return node;
+}
+
+static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (l1[i] < l2[i])
+ return -1;
+ if (l1[i] > l2[i])
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned long *longcpy(unsigned long *dest, const unsigned long *src,
+ size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ dest[i] = src[i];
+ return dest;
+}
+
+static unsigned long *longset(unsigned long *s, unsigned long c, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ s[i] = c;
+ return s;
+}
+
+static void dec_key(struct btree_geo *geo, unsigned long *key)
+{
+ unsigned long val;
+ int i;
+
+ for (i = geo->keylen - 1; i >= 0; i--) {
+ val = key[i];
+ key[i] = val - 1;
+ if (val)
+ break;
+ }
+}
+
+static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n)
+{
+ return &node[n * geo->keylen];
+}
+
+static void *bval(struct btree_geo *geo, unsigned long *node, int n)
+{
+ return (void *)node[geo->no_longs + n];
+}
+
+static void setkey(struct btree_geo *geo, unsigned long *node, int n,
+ unsigned long *key)
+{
+ longcpy(bkey(geo, node, n), key, geo->keylen);
+}
+
+static void setval(struct btree_geo *geo, unsigned long *node, int n,
+ void *val)
+{
+ node[geo->no_longs + n] = (unsigned long) val;
+}
+
+static void clearpair(struct btree_geo *geo, unsigned long *node, int n)
+{
+ longset(bkey(geo, node, n), 0, geo->keylen);
+ node[geo->no_longs + n] = 0;
+}
+
+static inline void __btree_init(struct btree_head *head)
+{
+ head->node = NULL;
+ head->height = 0;
+}
+
+void btree_init_mempool(struct btree_head *head, mempool_t *mempool)
+{
+ __btree_init(head);
+ head->mempool = mempool;
+}
+EXPORT_SYMBOL_GPL(btree_init_mempool);
+
+int btree_init(struct btree_head *head)
+{
+ __btree_init(head);
+ head->mempool = mempool_create(0, btree_alloc, btree_free, NULL);
+ if (!head->mempool)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(btree_init);
+
+void btree_destroy(struct btree_head *head)
+{
+ mempool_free(head->node, head->mempool);
+ mempool_destroy(head->mempool);
+ head->mempool = NULL;
+}
+EXPORT_SYMBOL_GPL(btree_destroy);
+
+void *btree_last(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ int height = head->height;
+ unsigned long *node = head->node;
+
+ if (height == 0)
+ return NULL;
+
+ for ( ; height > 1; height--)
+ node = bval(geo, node, 0);
+
+ longcpy(key, bkey(geo, node, 0), geo->keylen);
+ return bval(geo, node, 0);
+}
+EXPORT_SYMBOL_GPL(btree_last);
+
+static int keycmp(struct btree_geo *geo, unsigned long *node, int pos,
+ unsigned long *key)
+{
+ return longcmp(bkey(geo, node, pos), key, geo->keylen);
+}
+
+static int keyzero(struct btree_geo *geo, unsigned long *key)
+{
+ int i;
+
+ for (i = 0; i < geo->keylen; i++)
+ if (key[i])
+ return 0;
+
+ return 1;
+}
+
+static void *btree_lookup_node(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ int i, height = head->height;
+ unsigned long *node = head->node;
+
+ if (height == 0)
+ return NULL;
+
+ for ( ; height > 1; height--) {
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+ if (i == geo->no_pairs)
+ return NULL;
+ node = bval(geo, node, i);
+ if (!node)
+ return NULL;
+ }
+ return node;
+}
+
+void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ int i;
+ unsigned long *node;
+
+ node = btree_lookup_node(head, geo, key);
+ if (!node)
+ return NULL;
+
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) == 0)
+ return bval(geo, node, i);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(btree_lookup);
+
+int btree_update(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, void *val)
+{
+ int i;
+ unsigned long *node;
+
+ node = btree_lookup_node(head, geo, key);
+ if (!node)
+ return -ENOENT;
+
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) == 0) {
+ setval(geo, node, i, val);
+ return 0;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(btree_update);
+
+/*
+ * Usually this function is quite similar to normal lookup. But the key of
+ * a parent node may be smaller than the smallest key of all its siblings.
+ * In such a case we cannot just return NULL, as we have only proven that no
+ * key smaller than __key, but larger than this parent key exists.
+ * So we set __key to the parent key and retry. We have to use the smallest
+ * such parent key, which is the last parent key we encountered.
+ */
+void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *__key)
+{
+ int i, height;
+ unsigned long *node, *oldnode;
+ unsigned long *retry_key = NULL, key[MAX_KEYLEN];
+
+ if (keyzero(geo, __key))
+ return NULL;
+
+ if (head->height == 0)
+ return NULL;
+ longcpy(key, __key, geo->keylen);
+retry:
+ dec_key(geo, key);
+
+ node = head->node;
+ for (height = head->height ; height > 1; height--) {
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+ if (i == geo->no_pairs)
+ goto miss;
+ oldnode = node;
+ node = bval(geo, node, i);
+ if (!node)
+ goto miss;
+ retry_key = bkey(geo, oldnode, i);
+ }
+
+ if (!node)
+ goto miss;
+
+ for (i = 0; i < geo->no_pairs; i++) {
+ if (keycmp(geo, node, i, key) <= 0) {
+ if (bval(geo, node, i)) {
+ longcpy(__key, bkey(geo, node, i), geo->keylen);
+ return bval(geo, node, i);
+ } else
+ goto miss;
+ }
+ }
+miss:
+ if (retry_key) {
+ longcpy(key, retry_key, geo->keylen);
+ retry_key = NULL;
+ goto retry;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(btree_get_prev);
+
+static int getpos(struct btree_geo *geo, unsigned long *node,
+ unsigned long *key)
+{
+ int i;
+
+ for (i = 0; i < geo->no_pairs; i++) {
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+ }
+ return i;
+}
+
+static int getfill(struct btree_geo *geo, unsigned long *node, int start)
+{
+ int i;
+
+ for (i = start; i < geo->no_pairs; i++)
+ if (!bval(geo, node, i))
+ break;
+ return i;
+}
+
+/*
+ * locate the correct leaf node in the btree
+ */
+static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level)
+{
+ unsigned long *node = head->node;
+ int i, height;
+
+ for (height = head->height; height > level; height--) {
+ for (i = 0; i < geo->no_pairs; i++)
+ if (keycmp(geo, node, i, key) <= 0)
+ break;
+
+ if ((i == geo->no_pairs) || !bval(geo, node, i)) {
+ /* right-most key is too large, update it */
+ /* FIXME: If the right-most key on higher levels is
+ * always zero, this wouldn't be necessary. */
+ i--;
+ setkey(geo, node, i, key);
+ }
+ BUG_ON(i < 0);
+ node = bval(geo, node, i);
+ }
+ BUG_ON(!node);
+ return node;
+}
+
+static int btree_grow(struct btree_head *head, struct btree_geo *geo,
+ gfp_t gfp)
+{
+ unsigned long *node;
+ int fill;
+
+ node = btree_node_alloc(head, gfp);
+ if (!node)
+ return -ENOMEM;
+ if (head->node) {
+ fill = getfill(geo, head->node, 0);
+ setkey(geo, node, 0, bkey(geo, head->node, fill - 1));
+ setval(geo, node, 0, head->node);
+ }
+ head->node = node;
+ head->height++;
+ return 0;
+}
+
+static void btree_shrink(struct btree_head *head, struct btree_geo *geo)
+{
+ unsigned long *node;
+ int fill;
+
+ if (head->height <= 1)
+ return;
+
+ node = head->node;
+ fill = getfill(geo, node, 0);
+ BUG_ON(fill > 1);
+ head->node = bval(geo, node, 0);
+ head->height--;
+ mempool_free(node, head->mempool);
+}
+
+static int btree_insert_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, void *val, int level,
+ gfp_t gfp)
+{
+ unsigned long *node;
+ int i, pos, fill, err;
+
+ BUG_ON(!val);
+ if (head->height < level) {
+ err = btree_grow(head, geo, gfp);
+ if (err)
+ return err;
+ }
+
+retry:
+ node = find_level(head, geo, key, level);
+ pos = getpos(geo, node, key);
+ fill = getfill(geo, node, pos);
+ /* two identical keys are not allowed */
+ BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0);
+
+ if (fill == geo->no_pairs) {
+ /* need to split node */
+ unsigned long *new;
+
+ new = btree_node_alloc(head, gfp);
+ if (!new)
+ return -ENOMEM;
+ err = btree_insert_level(head, geo,
+ bkey(geo, node, fill / 2 - 1),
+ new, level + 1, gfp);
+ if (err) {
+ mempool_free(new, head->mempool);
+ return err;
+ }
+ for (i = 0; i < fill / 2; i++) {
+ setkey(geo, new, i, bkey(geo, node, i));
+ setval(geo, new, i, bval(geo, node, i));
+ setkey(geo, node, i, bkey(geo, node, i + fill / 2));
+ setval(geo, node, i, bval(geo, node, i + fill / 2));
+ clearpair(geo, node, i + fill / 2);
+ }
+ if (fill & 1) {
+ setkey(geo, node, i, bkey(geo, node, fill - 1));
+ setval(geo, node, i, bval(geo, node, fill - 1));
+ clearpair(geo, node, fill - 1);
+ }
+ goto retry;
+ }
+ BUG_ON(fill >= geo->no_pairs);
+
+ /* shift and insert */
+ for (i = fill; i > pos; i--) {
+ setkey(geo, node, i, bkey(geo, node, i - 1));
+ setval(geo, node, i, bval(geo, node, i - 1));
+ }
+ setkey(geo, node, pos, key);
+ setval(geo, node, pos, val);
+
+ return 0;
+}
+
+int btree_insert(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, void *val, gfp_t gfp)
+{
+ BUG_ON(!val);
+ return btree_insert_level(head, geo, key, val, 1, gfp);
+}
+EXPORT_SYMBOL_GPL(btree_insert);
+
+static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level);
+static void merge(struct btree_head *head, struct btree_geo *geo, int level,
+ unsigned long *left, int lfill,
+ unsigned long *right, int rfill,
+ unsigned long *parent, int lpos)
+{
+ int i;
+
+ for (i = 0; i < rfill; i++) {
+ /* Move all keys to the left */
+ setkey(geo, left, lfill + i, bkey(geo, right, i));
+ setval(geo, left, lfill + i, bval(geo, right, i));
+ }
+ /* Exchange left and right child in parent */
+ setval(geo, parent, lpos, right);
+ setval(geo, parent, lpos + 1, left);
+ /* Remove left (formerly right) child from parent */
+ btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1);
+ mempool_free(right, head->mempool);
+}
+
+static void rebalance(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level, unsigned long *child, int fill)
+{
+ unsigned long *parent, *left = NULL, *right = NULL;
+ int i, no_left, no_right;
+
+ if (fill == 0) {
+ /* Because we don't steal entries from a neighbour, this case
+ * can happen. Parent node contains a single child, this
+ * node, so merging with a sibling never happens.
+ */
+ btree_remove_level(head, geo, key, level + 1);
+ mempool_free(child, head->mempool);
+ return;
+ }
+
+ parent = find_level(head, geo, key, level + 1);
+ i = getpos(geo, parent, key);
+ BUG_ON(bval(geo, parent, i) != child);
+
+ if (i > 0) {
+ left = bval(geo, parent, i - 1);
+ no_left = getfill(geo, left, 0);
+ if (fill + no_left <= geo->no_pairs) {
+ merge(head, geo, level,
+ left, no_left,
+ child, fill,
+ parent, i - 1);
+ return;
+ }
+ }
+ if (i + 1 < getfill(geo, parent, i)) {
+ right = bval(geo, parent, i + 1);
+ no_right = getfill(geo, right, 0);
+ if (fill + no_right <= geo->no_pairs) {
+ merge(head, geo, level,
+ child, fill,
+ right, no_right,
+ parent, i);
+ return;
+ }
+ }
+ /*
+ * We could also try to steal one entry from the left or right
+ * neighbor. By not doing so we changed the invariant from
+ * "all nodes are at least half full" to "no two neighboring
+ * nodes can be merged". Which means that the average fill of
+ * all nodes is still half or better.
+ */
+}
+
+static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key, int level)
+{
+ unsigned long *node;
+ int i, pos, fill;
+ void *ret;
+
+ if (level > head->height) {
+ /* we recursed all the way up */
+ head->height = 0;
+ head->node = NULL;
+ return NULL;
+ }
+
+ node = find_level(head, geo, key, level);
+ pos = getpos(geo, node, key);
+ fill = getfill(geo, node, pos);
+ if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
+ return NULL;
+ ret = bval(geo, node, pos);
+
+ /* remove and shift */
+ for (i = pos; i < fill - 1; i++) {
+ setkey(geo, node, i, bkey(geo, node, i + 1));
+ setval(geo, node, i, bval(geo, node, i + 1));
+ }
+ clearpair(geo, node, fill - 1);
+
+ if (fill - 1 < geo->no_pairs / 2) {
+ if (level < head->height)
+ rebalance(head, geo, key, level, node, fill - 1);
+ else if (fill - 1 == 1)
+ btree_shrink(head, geo);
+ }
+
+ return ret;
+}
+
+void *btree_remove(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *key)
+{
+ if (head->height == 0)
+ return NULL;
+
+ return btree_remove_level(head, geo, key, 1);
+}
+EXPORT_SYMBOL_GPL(btree_remove);
+
+int btree_merge(struct btree_head *target, struct btree_head *victim,
+ struct btree_geo *geo, gfp_t gfp)
+{
+ unsigned long key[MAX_KEYLEN];
+ unsigned long dup[MAX_KEYLEN];
+ void *val;
+ int err;
+
+ BUG_ON(target == victim);
+
+ if (!(target->node)) {
+ /* target is empty, just copy fields over */
+ target->node = victim->node;
+ target->height = victim->height;
+ __btree_init(victim);
+ return 0;
+ }
+
+ /* TODO: This needs some optimizations. Currently we do three tree
+ * walks to remove a single object from the victim.
+ */
+ for (;;) {
+ if (!btree_last(victim, geo, key))
+ break;
+ val = btree_lookup(victim, geo, key);
+ err = btree_insert(target, geo, key, val, gfp);
+ if (err)
+ return err;
+ /* We must make a copy of the key, as the original will get
+ * mangled inside btree_remove. */
+ longcpy(dup, key, geo->keylen);
+ btree_remove(victim, geo, dup);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(btree_merge);
+
+static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo,
+ unsigned long *node, unsigned long opaque,
+ void (*func)(void *elem, unsigned long opaque,
+ unsigned long *key, size_t index,
+ void *func2),
+ void *func2, int reap, int height, size_t count)
+{
+ int i;
+ unsigned long *child;
+
+ for (i = 0; i < geo->no_pairs; i++) {
+ child = bval(geo, node, i);
+ if (!child)
+ break;
+ if (height > 1)
+ count = __btree_for_each(head, geo, child, opaque,
+ func, func2, reap, height - 1, count);
+ else
+ func(child, opaque, bkey(geo, node, i), count++,
+ func2);
+ }
+ if (reap)
+ mempool_free(node, head->mempool);
+ return count;
+}
+
+static void empty(void *elem, unsigned long opaque, unsigned long *key,
+ size_t index, void *func2)
+{
+}
+
+void visitorl(void *elem, unsigned long opaque, unsigned long *key,
+ size_t index, void *__func)
+{
+ visitorl_t func = __func;
+
+ func(elem, opaque, *key, index);
+}
+EXPORT_SYMBOL_GPL(visitorl);
+
+void visitor32(void *elem, unsigned long opaque, unsigned long *__key,
+ size_t index, void *__func)
+{
+ visitor32_t func = __func;
+ u32 *key = (void *)__key;
+
+ func(elem, opaque, *key, index);
+}
+EXPORT_SYMBOL_GPL(visitor32);
+
+void visitor64(void *elem, unsigned long opaque, unsigned long *__key,
+ size_t index, void *__func)
+{
+ visitor64_t func = __func;
+ u64 *key = (void *)__key;
+
+ func(elem, opaque, *key, index);
+}
+EXPORT_SYMBOL_GPL(visitor64);
+
+void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
+ size_t index, void *__func)
+{
+ visitor128_t func = __func;
+ u64 *key = (void *)__key;
+
+ func(elem, opaque, key[0], key[1], index);
+}
+EXPORT_SYMBOL_GPL(visitor128);
+
+size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
+ unsigned long opaque,
+ void (*func)(void *elem, unsigned long opaque,
+ unsigned long *key,
+ size_t index, void *func2),
+ void *func2)
+{
+ size_t count = 0;
+
+ if (!func2)
+ func = empty;
+ if (head->node)
+ count = __btree_for_each(head, geo, head->node, opaque, func,
+ func2, 0, head->height, 0);
+ return count;
+}
+EXPORT_SYMBOL_GPL(btree_visitor);
+
+size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
+ unsigned long opaque,
+ void (*func)(void *elem, unsigned long opaque,
+ unsigned long *key,
+ size_t index, void *func2),
+ void *func2)
+{
+ size_t count = 0;
+
+ if (!func2)
+ func = empty;
+ if (head->node)
+ count = __btree_for_each(head, geo, head->node, opaque, func,
+ func2, 1, head->height, 0);
+ __btree_init(head);
+ return count;
+}
+EXPORT_SYMBOL_GPL(btree_grim_visitor);
+
+static int __init btree_module_init(void)
+{
+ btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ return 0;
+}
+
+static void __exit btree_module_exit(void)
+{
+ kmem_cache_destroy(btree_cachep);
+}
+
+/* If core code starts using btree, initialization should happen even earlier */
+module_init(btree_module_init);
+module_exit(btree_module_exit);
+
+MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
+MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
+MODULE_LICENSE("GPL");