aboutsummaryrefslogtreecommitdiff
path: root/drivers/vfio/pci/mlx5/main.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /drivers/vfio/pci/mlx5/main.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to '')
-rw-r--r--drivers/vfio/pci/mlx5/main.c1211
1 files changed, 1211 insertions, 0 deletions
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
new file mode 100644
index 000000000..9feb89c6d
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -0,0 +1,1211 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/anon_inodes.h>
+
+#include "cmd.h"
+
+/* Arbitrary to prevent userspace from consuming endless memory */
+#define MAX_MIGRATION_SIZE (512*1024*1024)
+
+static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ return container_of(core_device, struct mlx5vf_pci_core_device,
+ core_device);
+}
+
+struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
+ }
+
+ cur_offset = buf->last_offset;
+
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+ return NULL;
+}
+
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(
+ &buf->table, page_list, filled, 0,
+ filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+ GFP_KERNEL);
+
+ if (ret)
+ goto err;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err:
+ kvfree(page_list);
+ return ret;
+}
+
+static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static int mlx5vf_release_file(struct inode *inode, struct file *filp)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+
+ mlx5vf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
+ bool *end_of_data)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * As we use a stream based FD we may expect having the data always
+ * on first chunk
+ */
+ migf->state = MLX5_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
+
+static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = mlx5vf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
+static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ if (!(filp->f_flags & O_NONBLOCK)) {
+ if (wait_event_interruptible(migf->poll_wait,
+ !list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE))
+ return -ERESTARTSYS;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len) {
+ ssize_t count;
+
+ vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
+ &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+
+ if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
+ if (filp->f_flags & O_NONBLOCK) {
+ done = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
+
+ if (!vhca_buf) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
+ goto out_unlock;
+ }
+ done += count;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static __poll_t mlx5vf_save_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ __poll_t pollflags = 0;
+
+ poll_wait(filp, &migf->poll_wait, wait);
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ else if (!list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE)
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ mutex_unlock(&migf->lock);
+
+ return pollflags;
+}
+
+/*
+ * FD is exposed and user can use it after receiving an error.
+ * Mark migf in error, and wake the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ wake_up_interruptible(&migf->poll_wait);
+}
+
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+ struct mlx5_vhca_data_buffer *buf;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ unsigned long minsz;
+ size_t inc_length = 0;
+ bool end_of_data;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * We can't issue a SAVE command when the device is suspended, so as
+ * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
+ * bytes that can't be read.
+ */
+ if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+ /*
+ * Once the query returns it's guaranteed that there is no
+ * active SAVE command.
+ * As so, the other code below is safe with the proper locks.
+ */
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+ MLX5VF_QUERY_INC);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+ if (buf) {
+ if (buf->start_pos == 0) {
+ info.initial_bytes = buf->header_image_size - *pos;
+ } else if (buf->start_pos ==
+ sizeof(struct mlx5_vf_migration_header)) {
+ /* First data buffer following the header */
+ info.initial_bytes = buf->start_pos +
+ buf->length - *pos;
+ } else {
+ info.dirty_bytes = buf->start_pos + buf->length - *pos;
+ }
+ } else {
+ if (!end_of_data) {
+ ret = -EINVAL;
+ goto err_migf_unlock;
+ }
+
+ info.dirty_bytes = inc_length;
+ }
+
+ if (!end_of_data || !inc_length) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state, save a new state to be ready for.
+ */
+ buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ mlx5vf_mark_err(migf);
+ goto err_state_unlock;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+ if (ret) {
+ mlx5vf_mark_err(migf);
+ mlx5vf_put_data_buffer(buf);
+ goto err_state_unlock;
+ }
+
+done:
+ mlx5vf_state_mutex_unlock(mvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
+static const struct file_operations mlx5vf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = mlx5vf_save_read,
+ .poll = mlx5vf_save_poll,
+ .unlocked_ioctl = mlx5vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = mlx5vf_release_file,
+ .llseek = no_llseek,
+};
+
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
+ int ret;
+
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
+ if (ret)
+ goto err;
+
+ buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
+ if (ret)
+ goto err_save;
+
+ return 0;
+
+err_save:
+ mlx5vf_put_data_buffer(buf);
+err:
+ mlx5vf_mark_err(migf);
+ return ret;
+}
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
+{
+ struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
+
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ init_waitqueue_head(&migf->poll_wait);
+ init_completion(&migf->save_comp);
+ /*
+ * save_comp is being used as a binary semaphore built from
+ * a completion. A normal mutex cannot be used because the lock is
+ * passed between kernel threads and lockdep can't model this.
+ */
+ complete(&migf->save_comp);
+ mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
+ INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
+ if (ret)
+ goto out_pd;
+
+ buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
+ if (ret)
+ goto out_save;
+ return migf;
+out_save:
+ mlx5vf_free_data_buffer(buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
+}
+
+static int
+mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+
+ page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
+ loff_t requested_length,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ int ret;
+
+ if (requested_length > MAX_MIGRATION_SIZE)
+ return -ENOMEM;
+
+ if (vhca_buf->allocated_length < requested_length) {
+ ret = mlx5vf_add_migration_pages(
+ vhca_buf,
+ DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ return ret;
+ }
+
+ while (*len) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static ssize_t
+mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ size_t image_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == image_size) {
+ migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
+ migf->max_pos += image_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
+ page = mlx5vf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
+ u64 flags;
+
+ vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
+ if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ flags = le64_to_cpup((__le64 *)(to_buff +
+ offsetof(struct mlx5_vf_migration_header, flags)));
+ if (flags) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+
+ migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
+ migf->max_pos += vhca_buf->length;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
+static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+ loff_t requested_length;
+ bool has_work = false;
+ ssize_t done = 0;
+ int ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ mutex_lock(&migf->mvdev->state_mutex);
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case MLX5_VF_LOAD_STATE_READ_HEADER:
+ ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
+ &buf, &len, pos,
+ &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_PREP_IMAGE:
+ {
+ u64 size = vhca_buf_header->header_image_size;
+
+ if (vhca_buf->allocated_length < size) {
+ mlx5vf_free_data_buffer(vhca_buf);
+
+ migf->buf = mlx5vf_alloc_data_buffer(migf,
+ size, DMA_TO_DEVICE);
+ if (IS_ERR(migf->buf)) {
+ ret = PTR_ERR(migf->buf);
+ migf->buf = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf = migf->buf;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
+ break;
+ }
+ case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
+ ret = mlx5vf_resume_read_image_no_header(vhca_buf,
+ requested_length,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_READ_IMAGE:
+ ret = mlx5vf_resume_read_image(migf, vhca_buf,
+ vhca_buf_header->header_image_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
+ ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
+ /* prep header buf for next image */
+ vhca_buf_header->length = 0;
+ vhca_buf_header->header_image_size = 0;
+ /* prep data buf for next image */
+ vhca_buf->length = 0;
+
+ break;
+ default:
+ break;
+ }
+ }
+
+out_unlock:
+ if (ret)
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ mutex_unlock(&migf->lock);
+ mlx5vf_state_mutex_unlock(migf->mvdev);
+ return ret ? ret : done;
+}
+
+static const struct file_operations mlx5vf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = mlx5vf_resume_write,
+ .release = mlx5vf_release_file,
+ .llseek = no_llseek,
+};
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
+
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ migf->buf = buf;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+ } else {
+ /* Initial state will be to read the image */
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ return migf;
+out_buf:
+ mlx5vf_free_data_buffer(migf->buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
+}
+
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+{
+ if (mvdev->resuming_migf) {
+ mlx5vf_disable_fd(mvdev->resuming_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
+ fput(mvdev->resuming_migf->filp);
+ mvdev->resuming_migf = NULL;
+ }
+ if (mvdev->saving_migf) {
+ mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
+ cancel_work_sync(&mvdev->saving_migf->async_data.work);
+ mlx5vf_disable_fd(mvdev->saving_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
+ fput(mvdev->saving_migf->filp);
+ mvdev->saving_migf = NULL;
+ }
+}
+
+static struct file *
+mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
+ u32 new)
+{
+ u32 cur = mvdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ ret = mlx5vf_cmd_resume_vhca(mvdev,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ ret = mlx5vf_cmd_resume_vhca(mvdev,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ mlx5vf_disable_fds(mvdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_resume_device_data(mvdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ ret = mlx5vf_cmd_load_vhca_state(mvdev,
+ mvdev->resuming_migf,
+ mvdev->resuming_migf->buf);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ mlx5vf_disable_fds(mvdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = mlx5vf_pci_save_device_inc_data(mvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above
+ */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
+{
+again:
+ spin_lock(&mvdev->reset_lock);
+ if (mvdev->deferred_reset) {
+ mvdev->deferred_reset = false;
+ spin_unlock(&mvdev->reset_lock);
+ mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ mlx5vf_disable_fds(mvdev);
+ goto again;
+ }
+ mutex_unlock(&mvdev->state_mutex);
+ spin_unlock(&mvdev->reset_lock);
+}
+
+static struct file *
+mlx5vf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&mvdev->state_mutex);
+ while (new_state != mvdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
+ if (IS_ERR(res))
+ break;
+ mvdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != mvdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mlx5vf_state_mutex_unlock(mvdev);
+ return res;
+}
+
+static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ size_t state_size;
+ int ret;
+
+ mutex_lock(&mvdev->state_mutex);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
+ &state_size, 0);
+ if (!ret)
+ *stop_copy_length = state_size;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
+static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&mvdev->state_mutex);
+ *curr_state = mvdev->mig_state;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return 0;
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+ if (!mvdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&mvdev->reset_lock);
+ mvdev->deferred_reset = true;
+ if (!mutex_trylock(&mvdev->state_mutex)) {
+ spin_unlock(&mvdev->reset_lock);
+ return;
+ }
+ spin_unlock(&mvdev->reset_lock);
+ mlx5vf_state_mutex_unlock(mvdev);
+}
+
+static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ struct vfio_pci_core_device *vdev = &mvdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ if (mvdev->migrate_cap)
+ mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ vfio_pci_core_finish_enable(vdev);
+ return 0;
+}
+
+static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mlx5vf_cmd_close_migratable(mvdev);
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
+ .migration_set_state = mlx5vf_pci_set_device_state,
+ .migration_get_state = mlx5vf_pci_get_device_state,
+ .migration_get_data_size = mlx5vf_pci_get_data_size,
+};
+
+static const struct vfio_log_ops mlx5vf_pci_log_ops = {
+ .log_start = mlx5vf_start_page_tracker,
+ .log_stop = mlx5vf_stop_page_tracker,
+ .log_read_and_clear = mlx5vf_tracker_read_and_clear,
+};
+
+static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+ struct mlx5vf_pci_core_device, core_device.vdev);
+ int ret;
+
+ ret = vfio_pci_core_init_dev(core_vdev);
+ if (ret)
+ return ret;
+
+ mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
+ &mlx5vf_pci_log_ops);
+
+ return 0;
+}
+
+static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+ struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mlx5vf_cmd_remove_migratable(mvdev);
+ vfio_pci_core_release_dev(core_vdev);
+}
+
+static const struct vfio_device_ops mlx5vf_pci_ops = {
+ .name = "mlx5-vfio-pci",
+ .init = mlx5vf_pci_init_dev,
+ .release = mlx5vf_pci_release_dev,
+ .open_device = mlx5vf_pci_open_device,
+ .close_device = mlx5vf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+};
+
+static int mlx5vf_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct mlx5vf_pci_core_device *mvdev;
+ int ret;
+
+ mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
+ &pdev->dev, &mlx5vf_pci_ops);
+ if (IS_ERR(mvdev))
+ return PTR_ERR(mvdev);
+
+ dev_set_drvdata(&pdev->dev, &mvdev->core_device);
+ ret = vfio_pci_core_register_device(&mvdev->core_device);
+ if (ret)
+ goto out_put_vdev;
+ return 0;
+
+out_put_vdev:
+ vfio_put_device(&mvdev->core_device.vdev);
+ return ret;
+}
+
+static void mlx5vf_pci_remove(struct pci_dev *pdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&mvdev->core_device);
+ vfio_put_device(&mvdev->core_device.vdev);
+}
+
+static const struct pci_device_id mlx5vf_pci_table[] = {
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
+ {}
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
+
+static const struct pci_error_handlers mlx5vf_err_handlers = {
+ .reset_done = mlx5vf_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver mlx5vf_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = mlx5vf_pci_table,
+ .probe = mlx5vf_pci_probe,
+ .remove = mlx5vf_pci_remove,
+ .err_handler = &mlx5vf_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(mlx5vf_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
+MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
+MODULE_DESCRIPTION(
+ "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");