aboutsummaryrefslogtreecommitdiff
path: root/mm/page_isolation.c
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /mm/page_isolation.c
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'mm/page_isolation.c')
-rw-r--r--mm/page_isolation.c671
1 files changed, 671 insertions, 0 deletions
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 000000000..47fbc1696
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/page_isolation.c
+ */
+
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+#include <linux/pageblock-flags.h>
+#include <linux/memory.h>
+#include <linux/hugetlb.h>
+#include <linux/page_owner.h>
+#include <linux/migrate.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_isolation.h>
+
+/*
+ * This function checks whether the range [start_pfn, end_pfn) includes
+ * unmovable pages or not. The range must fall into a single pageblock and
+ * consequently belong to a single zone.
+ *
+ * PageLRU check without isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
+ */
+static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
+ int migratetype, int flags)
+{
+ struct page *page = pfn_to_page(start_pfn);
+ struct zone *zone = page_zone(page);
+ unsigned long pfn;
+
+ VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
+ pageblock_start_pfn(end_pfn - 1));
+
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return NULL;
+
+ return page;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
+ if (PageReserved(page))
+ return page;
+
+ /*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page) || PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
+
+ if (PageHuge(page)) {
+ if (!hugepage_migration_supported(page_hstate(head)))
+ return page;
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
+ return page;
+ }
+
+ skip_pages = compound_nr(head) - (page - head);
+ pfn += skip_pages - 1;
+ continue;
+ }
+
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_refcount is zero at all time.
+ */
+ if (!page_ref_count(page)) {
+ if (PageBuddy(page))
+ pfn += (1 << buddy_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ continue;
+
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
+ if (__PageMovable(page) || PageLRU(page))
+ continue;
+
+ /*
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
+ */
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * This function set pageblock migratetype to isolate if no unmovable page is
+ * present in [start_pfn, end_pfn). The pageblock must intersect with
+ * [start_pfn, end_pfn).
+ */
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct zone *zone = page_zone(page);
+ struct page *unmovable;
+ unsigned long flags;
+ unsigned long check_unmovable_start, check_unmovable_end;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /*
+ * We assume the caller intended to SET migrate type to isolate.
+ * If it is already set, then someone else must have raced and
+ * set it before us.
+ */
+ if (is_migrate_isolate_page(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
+
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
+ * to avoid redundant checks.
+ */
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+ end_pfn);
+
+ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
+ migratetype, isol_flags);
+ if (!unmovable) {
+ unsigned long nr_pages;
+ int mt = get_pageblock_migratetype(page);
+
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ zone->nr_isolate_pageblock++;
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+ NULL);
+
+ __mod_zone_freepage_state(zone, -nr_pages, mt);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return 0;
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (isol_flags & REPORT_FAILURE) {
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
+ }
+
+ return -EBUSY;
+}
+
+static void unset_migratetype_isolate(struct page *page, int migratetype)
+{
+ struct zone *zone;
+ unsigned long flags, nr_pages;
+ bool isolated_page = false;
+ unsigned int order;
+ struct page *buddy;
+
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!is_migrate_isolate_page(page))
+ goto out;
+
+ /*
+ * Because freepage with more than pageblock_order on isolated
+ * pageblock is restricted to merge due to freepage counting problem,
+ * it is possible that there is free buddy page.
+ * move_freepages_block() doesn't care of merge so we need other
+ * approach in order to merge them. Isolation and free will make
+ * these pages to be merged.
+ */
+ if (PageBuddy(page)) {
+ order = buddy_order(page);
+ if (order >= pageblock_order && order < MAX_ORDER - 1) {
+ buddy = find_buddy_page_pfn(page, page_to_pfn(page),
+ order, NULL);
+ if (buddy && !is_migrate_isolate_page(buddy)) {
+ isolated_page = !!__isolate_free_page(page, order);
+ /*
+ * Isolating a free page in an isolated pageblock
+ * is expected to always work as watermarks don't
+ * apply here.
+ */
+ VM_WARN_ON(!isolated_page);
+ }
+ }
+ }
+
+ /*
+ * If we isolate freepage with more than pageblock_order, there
+ * should be no freepage in the range, so we could avoid costly
+ * pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * to the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
+ */
+ if (!isolated_page) {
+ nr_pages = move_freepages_block(zone, page, migratetype, NULL);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
+ }
+ set_pageblock_migratetype(page, migratetype);
+ if (isolated_page)
+ __putback_isolated_page(page, order, migratetype);
+ zone->nr_isolate_pageblock--;
+out:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static inline struct page *
+__first_valid_page(unsigned long pfn, unsigned long nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+
+ page = pfn_to_online_page(pfn + i);
+ if (!page)
+ continue;
+ return page;
+ }
+ return NULL;
+}
+
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn: pageblock-aligned pfn that a page might cross
+ * @flags: isolation flags
+ * @gfp_flags: GFP flags used for migrating pages
+ * @isolate_before: isolate the pageblock before the boundary_pfn
+ * @skip_isolation: the flag to skip the pageblock isolation in second
+ * isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
+ * [ MAX_ORDER-1 ]
+ * [ pageblock0 | pageblock1 ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which is supposed to; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
+{
+ unsigned long start_pfn;
+ unsigned long isolate_pageblock;
+ unsigned long pfn;
+ struct zone *zone;
+ int ret;
+
+ VM_BUG_ON(!pageblock_aligned(boundary_pfn));
+
+ if (isolate_before)
+ isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+ else
+ isolate_pageblock = boundary_pfn;
+
+ /*
+ * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
+ * only isolating a subset of pageblocks from a bigger than pageblock
+ * free or in-use page. Also make sure all to-be-isolated pageblocks
+ * are within the same zone.
+ */
+ zone = page_zone(pfn_to_page(isolate_pageblock));
+ start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+ zone->zone_start_pfn);
+
+ if (skip_isolation) {
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Bail out early when the to-be-isolated pageblock does not form
+ * a free or in-use page across boundary_pfn:
+ *
+ * 1. isolate before boundary_pfn: the page after is not online
+ * 2. isolate after boundary_pfn: the page before is not online
+ *
+ * This also ensures correctness. Without it, when isolate after
+ * boundary_pfn and [start_pfn, boundary_pfn) are not online,
+ * __first_valid_page() will return unexpected NULL in the for loop
+ * below.
+ */
+ if (isolate_before) {
+ if (!pfn_to_online_page(boundary_pfn))
+ return 0;
+ } else {
+ if (!pfn_to_online_page(boundary_pfn - 1))
+ return 0;
+ }
+
+ for (pfn = start_pfn; pfn < boundary_pfn;) {
+ struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+ VM_BUG_ON(!page);
+ pfn = page_to_pfn(page);
+ /*
+ * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
+ * free pages in [start_pfn, boundary_pfn), its head page will
+ * always be in the range.
+ */
+ if (PageBuddy(page)) {
+ int order = buddy_order(page);
+
+ if (pfn + (1UL << order) > boundary_pfn) {
+ /* free page changed before split, check it again */
+ if (split_free_page(page, order, boundary_pfn - pfn))
+ continue;
+ }
+
+ pfn += 1UL << order;
+ continue;
+ }
+ /*
+ * migrate compound pages then let the free page handling code
+ * above do the rest. If migration is not possible, just fail.
+ */
+ if (PageCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned long head_pfn = page_to_pfn(head);
+ unsigned long nr_pages = compound_nr(head);
+
+ if (head_pfn + nr_pages <= boundary_pfn) {
+ pfn = head_pfn + nr_pages;
+ continue;
+ }
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /*
+ * hugetlb, lru compound (THP), and movable compound pages
+ * can be migrated. Otherwise, fail the isolation.
+ */
+ if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+ int order;
+ unsigned long outer_pfn;
+ int page_mt = get_pageblock_migratetype(page);
+ bool isolate_page = !is_migrate_isolate_page(page);
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(head_pfn)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .no_set_skip_hint = true,
+ .gfp_mask = gfp_flags,
+ .alloc_contig = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * XXX: mark the page as MIGRATE_ISOLATE so that
+ * no one else can grab the freed page after migration.
+ * Ideally, the page should be freed as two separate
+ * pages to be added into separate migratetype free
+ * lists.
+ */
+ if (isolate_page) {
+ ret = set_migratetype_isolate(page, page_mt,
+ flags, head_pfn, head_pfn + nr_pages);
+ if (ret)
+ goto failed;
+ }
+
+ ret = __alloc_contig_migrate_range(&cc, head_pfn,
+ head_pfn + nr_pages);
+
+ /*
+ * restore the page's migratetype so that it can
+ * be split into separate migratetype free lists
+ * later.
+ */
+ if (isolate_page)
+ unset_migratetype_isolate(page, page_mt);
+
+ if (ret)
+ goto failed;
+ /*
+ * reset pfn to the head of the free page, so
+ * that the free page handling code above can split
+ * the free page to the right migratetype list.
+ *
+ * head_pfn is not used here as a hugetlb page order
+ * can be bigger than MAX_ORDER-1, but after it is
+ * freed, the free page order is not. Use pfn within
+ * the range to find the head of the free page.
+ */
+ order = 0;
+ outer_pfn = pfn;
+ while (!PageBuddy(pfn_to_page(outer_pfn))) {
+ /* stop if we cannot find the free page */
+ if (++order >= MAX_ORDER)
+ goto failed;
+ outer_pfn &= ~0UL << order;
+ }
+ pfn = outer_pfn;
+ continue;
+ } else
+#endif
+ goto failed;
+ }
+
+ pfn++;
+ }
+ return 0;
+failed:
+ /* restore the original migratetype */
+ if (!skip_isolation)
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+ return -EBUSY;
+}
+
+/**
+ * start_isolate_page_range() - make page-allocation-type of range of pages to
+ * be MIGRATE_ISOLATE.
+ * @start_pfn: The lower PFN of the range to be isolated.
+ * @end_pfn: The upper PFN of the range to be isolated.
+ * @migratetype: Migrate type to set in error recovery.
+ * @flags: The following flags are allowed (they can be combined in
+ * a bit mask)
+ * MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ * e.g., skip over PageHWPoison() pages
+ * and PageOffline() pages.
+ * REPORT_FAILURE - report details about the failure to
+ * isolate the range
+ * @gfp_flags: GFP flags used for migrating pages that sit across the
+ * range boundaries.
+ *
+ * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
+ * the range will never be allocated. Any free pages and pages freed in the
+ * future will not be allocated again. If specified range includes migrate types
+ * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all
+ * pages in the range finally, the caller have to free all pages in the range.
+ * test_page_isolated() can be used for test it.
+ *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
+ * There is no high level synchronization mechanism that prevents two threads
+ * from trying to isolate overlapping ranges. If this happens, one thread
+ * will notice pageblocks in the overlapping range already set to isolate.
+ * This happens in set_migratetype_isolate, and set_migratetype_isolate
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
+ * prevents two threads from simultaneously working on overlapping ranges.
+ *
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * A call to drain_all_pages() after isolation can flush most of them. However
+ * in some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs, zone_pcp_disable/enable()
+ * might be used to flush and disable pcplist before isolation and enable after
+ * unisolation.
+ *
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
+ */
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ int migratetype, int flags, gfp_t gfp_flags)
+{
+ unsigned long pfn;
+ struct page *page;
+ /* isolation is done at page block granularity */
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
+ int ret;
+ bool skip_isolation = false;
+
+ /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
+ if (ret)
+ return ret;
+
+ if (isolate_start == isolate_end - pageblock_nr_pages)
+ skip_isolation = true;
+
+ /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
+ if (ret) {
+ unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ return ret;
+ }
+
+ /* skip isolated pageblocks at the beginning and end */
+ for (pfn = isolate_start + pageblock_nr_pages;
+ pfn < isolate_end - pageblock_nr_pages;
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page && set_migratetype_isolate(page, migratetype, flags,
+ start_pfn, end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn, migratetype);
+ unset_migratetype_isolate(
+ pfn_to_page(isolate_end - pageblock_nr_pages),
+ migratetype);
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Make isolated pages available again.
+ */
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ int migratetype)
+{
+ unsigned long pfn;
+ struct page *page;
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
+
+ for (pfn = isolate_start;
+ pfn < isolate_end;
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (!page || !is_migrate_isolate_page(page))
+ continue;
+ unset_migratetype_isolate(page, migratetype);
+ }
+}
+/*
+ * Test all pages in the range is free(means isolated) or not.
+ * all pages in [start_pfn...end_pfn) must be in the same zone.
+ * zone->lock must be held before call this.
+ *
+ * Returns the last tested pfn.
+ */
+static unsigned long
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
+ int flags)
+{
+ struct page *page;
+
+ while (pfn < end_pfn) {
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page))
+ /*
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
+ */
+ pfn += 1 << buddy_order(page);
+ else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ /* A HWPoisoned page cannot be also PageBuddy */
+ pfn++;
+ else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
+ !page_count(page))
+ /*
+ * The responsible driver agreed to skip PageOffline()
+ * pages when offlining memory by dropping its
+ * reference in MEM_GOING_OFFLINE.
+ */
+ pfn++;
+ else
+ break;
+ }
+
+ return pfn;
+}
+
+/* Caller should ensure that requested range is in a single zone */
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+ int isol_flags)
+{
+ unsigned long pfn, flags;
+ struct page *page;
+ struct zone *zone;
+ int ret;
+
+ /*
+ * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
+ * are not aligned to pageblock_nr_pages.
+ * Then we just check migratetype first.
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page && !is_migrate_isolate_page(page))
+ break;
+ }
+ page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+ if ((pfn < end_pfn) || !page) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* Check all pages are free or marked as ISOLATED */
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ ret = pfn < end_pfn ? -EBUSY : 0;
+
+out:
+ trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+
+ return ret;
+}