aboutsummaryrefslogtreecommitdiff
path: root/samples/bpf/xdp_router_ipv4_user.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /samples/bpf/xdp_router_ipv4_user.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'samples/bpf/xdp_router_ipv4_user.c')
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c699
1 files changed, 699 insertions, 0 deletions
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
new file mode 100644
index 000000000..9d41db09c
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2017 Cavium, Inc.
+ */
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <net/if.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include "bpf_util.h"
+#include <bpf/libbpf.h>
+#include <libgen.h>
+#include <getopt.h>
+#include <pthread.h>
+#include "xdp_sample_user.h"
+#include "xdp_router_ipv4.skel.h"
+
+static const char *__doc__ =
+"XDP IPv4 router implementation\n"
+"Usage: xdp_router_ipv4 <IFNAME-0> ... <IFNAME-N>\n";
+
+static char buf[8192];
+static int lpm_map_fd;
+static int arp_table_map_fd;
+static int exact_match_map_fd;
+static int tx_port_map_fd;
+
+static bool routes_thread_exit;
+static int interval = 5;
+
+static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
+ SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT;
+
+DEFINE_SAMPLE_INIT(xdp_router_ipv4);
+
+static const struct option long_options[] = {
+ { "help", no_argument, NULL, 'h' },
+ { "skb-mode", no_argument, NULL, 'S' },
+ { "force", no_argument, NULL, 'F' },
+ { "interval", required_argument, NULL, 'i' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "stats", no_argument, NULL, 's' },
+ {}
+};
+
+static int get_route_table(int rtm_family);
+
+static int recv_msg(struct sockaddr_nl sock_addr, int sock)
+{
+ struct nlmsghdr *nh;
+ int len, nll = 0;
+ char *buf_ptr;
+
+ buf_ptr = buf;
+ while (1) {
+ len = recv(sock, buf_ptr, sizeof(buf) - nll, 0);
+ if (len < 0)
+ return len;
+
+ nh = (struct nlmsghdr *)buf_ptr;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ buf_ptr += len;
+ nll += len;
+ if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH)
+ break;
+
+ if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE)
+ break;
+ }
+ return nll;
+}
+
+/* Function to parse the route entry returned by netlink
+ * Updates the route entry related map entries
+ */
+static void read_route(struct nlmsghdr *nh, int nll)
+{
+ char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24];
+ struct bpf_lpm_trie_key *prefix_key;
+ struct rtattr *rt_attr;
+ struct rtmsg *rt_msg;
+ int rtm_family;
+ int rtl;
+ int i;
+ struct route_table {
+ int dst_len, iface, metric;
+ __be32 dst, gw;
+ __be64 mac;
+ } route;
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ };
+
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ memset(&route, 0, sizeof(route));
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct rtmsg *)NLMSG_DATA(nh);
+ rtm_family = rt_msg->rtm_family;
+ if (rtm_family == AF_INET)
+ if (rt_msg->rtm_table != RT_TABLE_MAIN)
+ continue;
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ rtl = RTM_PAYLOAD(nh);
+
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ (*((__be32 *)RTA_DATA(rt_attr))));
+ break;
+ case RTA_GATEWAY:
+ sprintf(gws, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_OIF:
+ sprintf(ifs, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_METRICS:
+ sprintf(metrics, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ default:
+ break;
+ }
+ }
+ sprintf(dsts_len, "%d", rt_msg->rtm_dst_len);
+ route.dst = atoi(dsts);
+ route.dst_len = atoi(dsts_len);
+ route.gw = atoi(gws);
+ route.iface = atoi(ifs);
+ route.metric = atoi(metrics);
+ assert(get_mac_addr(route.iface, &route.mac) == 0);
+ assert(bpf_map_update_elem(tx_port_map_fd,
+ &route.iface, &route.iface, 0) == 0);
+ if (rtm_family == AF_INET) {
+ struct trie_value {
+ __u8 prefix[4];
+ __be64 value;
+ int ifindex;
+ int metric;
+ __be32 gw;
+ } *prefix_value;
+
+ prefix_key = alloca(sizeof(*prefix_key) + 4);
+ prefix_value = alloca(sizeof(*prefix_value));
+
+ prefix_key->prefixlen = 32;
+ prefix_key->prefixlen = route.dst_len;
+ direct_entry.mac = route.mac & 0xffffffffffff;
+ direct_entry.ifindex = route.iface;
+ direct_entry.arp.mac = 0;
+ direct_entry.arp.dst = 0;
+ if (route.dst_len == 32) {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ assert(bpf_map_delete_elem(exact_match_map_fd,
+ &route.dst) == 0);
+ } else {
+ if (bpf_map_lookup_elem(arp_table_map_fd,
+ &route.dst,
+ &direct_entry.arp.mac) == 0)
+ direct_entry.arp.dst = route.dst;
+ assert(bpf_map_update_elem(exact_match_map_fd,
+ &route.dst,
+ &direct_entry, 0) == 0);
+ }
+ }
+ for (i = 0; i < 4; i++)
+ prefix_key->data[i] = (route.dst >> i * 8) & 0xff;
+
+ if (bpf_map_lookup_elem(lpm_map_fd, prefix_key,
+ prefix_value) < 0) {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] = prefix_key->data[i];
+ prefix_value->value = route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+
+ assert(bpf_map_update_elem(lpm_map_fd,
+ prefix_key,
+ prefix_value, 0
+ ) == 0);
+ } else {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ assert(bpf_map_delete_elem(lpm_map_fd,
+ prefix_key
+ ) == 0);
+ /* Rereading the route table to check if
+ * there is an entry with the same
+ * prefix but a different metric as the
+ * deleted entry.
+ */
+ get_route_table(AF_INET);
+ } else if (prefix_key->data[0] ==
+ prefix_value->prefix[0] &&
+ prefix_key->data[1] ==
+ prefix_value->prefix[1] &&
+ prefix_key->data[2] ==
+ prefix_value->prefix[2] &&
+ prefix_key->data[3] ==
+ prefix_value->prefix[3] &&
+ route.metric >= prefix_value->metric) {
+ continue;
+ } else {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] =
+ prefix_key->data[i];
+ prefix_value->value =
+ route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+ assert(bpf_map_update_elem(lpm_map_fd,
+ prefix_key,
+ prefix_value,
+ 0) == 0);
+ }
+ }
+ }
+ memset(&route, 0, sizeof(route));
+ memset(dsts, 0, sizeof(dsts));
+ memset(dsts_len, 0, sizeof(dsts_len));
+ memset(gws, 0, sizeof(gws));
+ memset(ifs, 0, sizeof(ifs));
+ memset(&route, 0, sizeof(route));
+ }
+}
+
+/* Function to read the existing route table when the process is launched*/
+static int get_route_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+
+ struct {
+ struct nlmsghdr nl;
+ struct rtmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+ return -errno;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+ ret = -errno;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETROUTE;
+
+ req.rt.rtm_family = rtm_family;
+ req.rt.rtm_table = RT_TABLE_MAIN;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ fprintf(stderr, "send to netlink: %s\n", strerror(errno));
+ ret = -errno;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ fprintf(stderr, "recv from netlink: %s\n", strerror(nll));
+ ret = nll;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_route(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to parse the arp entry returned by netlink
+ * Updates the arp entry related map entries
+ */
+static void read_arp(struct nlmsghdr *nh, int nll)
+{
+ struct rtattr *rt_attr;
+ char dsts[24], mac[24];
+ struct ndmsg *rt_msg;
+ int rtl, ndm_family;
+
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ } arp_entry;
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct ndmsg *)NLMSG_DATA(nh);
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ ndm_family = rt_msg->ndm_family;
+ rtl = RTM_PAYLOAD(nh);
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case NDA_LLADDR:
+ sprintf(mac, "%lld",
+ *((__be64 *)RTA_DATA(rt_attr)));
+ break;
+ default:
+ break;
+ }
+ }
+ arp_entry.dst = atoi(dsts);
+ arp_entry.mac = atol(mac);
+
+ if (ndm_family == AF_INET) {
+ if (bpf_map_lookup_elem(exact_match_map_fd,
+ &arp_entry.dst,
+ &direct_entry) == 0) {
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ direct_entry.arp.dst = 0;
+ direct_entry.arp.mac = 0;
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ direct_entry.arp.dst = arp_entry.dst;
+ direct_entry.arp.mac = arp_entry.mac;
+ }
+ assert(bpf_map_update_elem(exact_match_map_fd,
+ &arp_entry.dst,
+ &direct_entry, 0
+ ) == 0);
+ memset(&direct_entry, 0, sizeof(direct_entry));
+ }
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ assert(bpf_map_delete_elem(arp_table_map_fd,
+ &arp_entry.dst) == 0);
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ assert(bpf_map_update_elem(arp_table_map_fd,
+ &arp_entry.dst,
+ &arp_entry.mac, 0
+ ) == 0);
+ }
+ }
+ memset(&arp_entry, 0, sizeof(arp_entry));
+ memset(dsts, 0, sizeof(dsts));
+ }
+}
+
+/* Function to read the existing arp table when the process is launched*/
+static int get_arp_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+ struct {
+ struct nlmsghdr nl;
+ struct ndmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+ return -errno;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+ ret = -errno;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETNEIGH;
+ req.rt.ndm_state = NUD_REACHABLE;
+ req.rt.ndm_family = rtm_family;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ fprintf(stderr, "send to netlink: %s\n", strerror(errno));
+ ret = -errno;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ fprintf(stderr, "recv from netlink: %s\n", strerror(nll));
+ ret = nll;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to keep track and update changes in route and arp table
+ * Give regular statistics of packets forwarded
+ */
+static void *monitor_routes_thread(void *arg)
+{
+ struct pollfd fds_route, fds_arp;
+ struct sockaddr_nl la, lr;
+ int sock, sock_arp, nll;
+ struct nlmsghdr *nh;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+ return NULL;
+ }
+
+ fcntl(sock, F_SETFL, O_NONBLOCK);
+ memset(&lr, 0, sizeof(lr));
+ lr.nl_family = AF_NETLINK;
+ lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
+ if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
+ fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+ close(sock);
+ return NULL;
+ }
+
+ fds_route.fd = sock;
+ fds_route.events = POLL_IN;
+
+ sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock_arp < 0) {
+ fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
+ close(sock);
+ return NULL;
+ }
+
+ fcntl(sock_arp, F_SETFL, O_NONBLOCK);
+ memset(&la, 0, sizeof(la));
+ la.nl_family = AF_NETLINK;
+ la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
+ if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
+ fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
+ goto cleanup;
+ }
+
+ fds_arp.fd = sock_arp;
+ fds_arp.events = POLL_IN;
+
+ /* dump route and arp tables */
+ if (get_arp_table(AF_INET) < 0) {
+ fprintf(stderr, "Failed reading arp table\n");
+ goto cleanup;
+ }
+
+ if (get_route_table(AF_INET) < 0) {
+ fprintf(stderr, "Failed reading route table\n");
+ goto cleanup;
+ }
+
+ while (!routes_thread_exit) {
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_route, 1, 3) == POLL_IN) {
+ nll = recv_msg(lr, sock);
+ if (nll < 0) {
+ fprintf(stderr, "recv from netlink: %s\n",
+ strerror(nll));
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ read_route(nh, nll);
+ }
+
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_arp, 1, 3) == POLL_IN) {
+ nll = recv_msg(la, sock_arp);
+ if (nll < 0) {
+ fprintf(stderr, "recv from netlink: %s\n",
+ strerror(nll));
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+ }
+
+ sleep(interval);
+ }
+
+cleanup:
+ close(sock_arp);
+ close(sock);
+ return NULL;
+}
+
+static void usage(char *argv[], const struct option *long_options,
+ const char *doc, int mask, bool error,
+ struct bpf_object *obj)
+{
+ sample_usage(argv, long_options, doc, mask, error);
+}
+
+int main(int argc, char **argv)
+{
+ bool error = true, generic = false, force = false;
+ int opt, ret = EXIT_FAIL_BPF;
+ struct xdp_router_ipv4 *skel;
+ int i, total_ifindex = argc - 1;
+ char **ifname_list = argv + 1;
+ pthread_t routes_thread;
+ int longindex = 0;
+
+ if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) {
+ fprintf(stderr, "Failed to set libbpf strict mode: %s\n",
+ strerror(errno));
+ goto end;
+ }
+
+ skel = xdp_router_ipv4__open();
+ if (!skel) {
+ fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n",
+ strerror(errno));
+ goto end;
+ }
+
+ ret = sample_init_pre_load(skel);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to sample_init_pre_load: %s\n",
+ strerror(-ret));
+ ret = EXIT_FAIL_BPF;
+ goto end_destroy;
+ }
+
+ ret = xdp_router_ipv4__load(skel);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n",
+ strerror(errno));
+ goto end_destroy;
+ }
+
+ ret = sample_init(skel, mask);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret));
+ ret = EXIT_FAIL;
+ goto end_destroy;
+ }
+
+ while ((opt = getopt_long(argc, argv, "si:SFvh",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 's':
+ mask |= SAMPLE_REDIRECT_MAP_CNT;
+ total_ifindex--;
+ ifname_list++;
+ break;
+ case 'i':
+ interval = strtoul(optarg, NULL, 0);
+ total_ifindex -= 2;
+ ifname_list += 2;
+ break;
+ case 'S':
+ generic = true;
+ total_ifindex--;
+ ifname_list++;
+ break;
+ case 'F':
+ force = true;
+ total_ifindex--;
+ ifname_list++;
+ break;
+ case 'v':
+ sample_switch_mode();
+ total_ifindex--;
+ ifname_list++;
+ break;
+ case 'h':
+ error = false;
+ default:
+ usage(argv, long_options, __doc__, mask, error, skel->obj);
+ goto end_destroy;
+ }
+ }
+
+ ret = EXIT_FAIL_OPTION;
+ if (optind == argc) {
+ usage(argv, long_options, __doc__, mask, true, skel->obj);
+ goto end_destroy;
+ }
+
+ lpm_map_fd = bpf_map__fd(skel->maps.lpm_map);
+ if (lpm_map_fd < 0) {
+ fprintf(stderr, "Failed loading lpm_map %s\n",
+ strerror(-lpm_map_fd));
+ goto end_destroy;
+ }
+ arp_table_map_fd = bpf_map__fd(skel->maps.arp_table);
+ if (arp_table_map_fd < 0) {
+ fprintf(stderr, "Failed loading arp_table_map_fd %s\n",
+ strerror(-arp_table_map_fd));
+ goto end_destroy;
+ }
+ exact_match_map_fd = bpf_map__fd(skel->maps.exact_match);
+ if (exact_match_map_fd < 0) {
+ fprintf(stderr, "Failed loading exact_match_map_fd %s\n",
+ strerror(-exact_match_map_fd));
+ goto end_destroy;
+ }
+ tx_port_map_fd = bpf_map__fd(skel->maps.tx_port);
+ if (tx_port_map_fd < 0) {
+ fprintf(stderr, "Failed loading tx_port_map_fd %s\n",
+ strerror(-tx_port_map_fd));
+ goto end_destroy;
+ }
+
+ ret = EXIT_FAIL_XDP;
+ for (i = 0; i < total_ifindex; i++) {
+ int index = if_nametoindex(ifname_list[i]);
+
+ if (!index) {
+ fprintf(stderr, "Interface %s not found %s\n",
+ ifname_list[i], strerror(-tx_port_map_fd));
+ goto end_destroy;
+ }
+ if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog,
+ index, generic, force) < 0)
+ goto end_destroy;
+ }
+
+ ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL);
+ if (ret) {
+ fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(-ret));
+ ret = EXIT_FAIL;
+ goto end_destroy;
+ }
+
+ ret = sample_run(interval, NULL, NULL);
+ routes_thread_exit = true;
+
+ if (ret < 0) {
+ fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
+ ret = EXIT_FAIL;
+ goto end_thread_wait;
+ }
+ ret = EXIT_OK;
+
+end_thread_wait:
+ pthread_join(routes_thread, NULL);
+end_destroy:
+ xdp_router_ipv4__destroy(skel);
+end:
+ sample_exit(ret);
+}