xref: /openbmc/linux/mm/migrate_device.c (revision a790cc3a)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Device Memory Migration functionality.
4  *
5  * Originally written by Jérôme Glisse.
6  */
7 #include <linux/export.h>
8 #include <linux/memremap.h>
9 #include <linux/migrate.h>
10 #include <linux/mm.h>
11 #include <linux/mm_inline.h>
12 #include <linux/mmu_notifier.h>
13 #include <linux/oom.h>
14 #include <linux/pagewalk.h>
15 #include <linux/rmap.h>
16 #include <linux/swapops.h>
17 #include <asm/tlbflush.h>
18 #include "internal.h"
19 
20 static int migrate_vma_collect_skip(unsigned long start,
21 				    unsigned long end,
22 				    struct mm_walk *walk)
23 {
24 	struct migrate_vma *migrate = walk->private;
25 	unsigned long addr;
26 
27 	for (addr = start; addr < end; addr += PAGE_SIZE) {
28 		migrate->dst[migrate->npages] = 0;
29 		migrate->src[migrate->npages++] = 0;
30 	}
31 
32 	return 0;
33 }
34 
35 static int migrate_vma_collect_hole(unsigned long start,
36 				    unsigned long end,
37 				    __always_unused int depth,
38 				    struct mm_walk *walk)
39 {
40 	struct migrate_vma *migrate = walk->private;
41 	unsigned long addr;
42 
43 	/* Only allow populating anonymous memory. */
44 	if (!vma_is_anonymous(walk->vma))
45 		return migrate_vma_collect_skip(start, end, walk);
46 
47 	for (addr = start; addr < end; addr += PAGE_SIZE) {
48 		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
49 		migrate->dst[migrate->npages] = 0;
50 		migrate->npages++;
51 		migrate->cpages++;
52 	}
53 
54 	return 0;
55 }
56 
57 static int migrate_vma_collect_pmd(pmd_t *pmdp,
58 				   unsigned long start,
59 				   unsigned long end,
60 				   struct mm_walk *walk)
61 {
62 	struct migrate_vma *migrate = walk->private;
63 	struct vm_area_struct *vma = walk->vma;
64 	struct mm_struct *mm = vma->vm_mm;
65 	unsigned long addr = start, unmapped = 0;
66 	spinlock_t *ptl;
67 	pte_t *ptep;
68 
69 again:
70 	if (pmd_none(*pmdp))
71 		return migrate_vma_collect_hole(start, end, -1, walk);
72 
73 	if (pmd_trans_huge(*pmdp)) {
74 		struct page *page;
75 
76 		ptl = pmd_lock(mm, pmdp);
77 		if (unlikely(!pmd_trans_huge(*pmdp))) {
78 			spin_unlock(ptl);
79 			goto again;
80 		}
81 
82 		page = pmd_page(*pmdp);
83 		if (is_huge_zero_page(page)) {
84 			spin_unlock(ptl);
85 			split_huge_pmd(vma, pmdp, addr);
86 			if (pmd_trans_unstable(pmdp))
87 				return migrate_vma_collect_skip(start, end,
88 								walk);
89 		} else {
90 			int ret;
91 
92 			get_page(page);
93 			spin_unlock(ptl);
94 			if (unlikely(!trylock_page(page)))
95 				return migrate_vma_collect_skip(start, end,
96 								walk);
97 			ret = split_huge_page(page);
98 			unlock_page(page);
99 			put_page(page);
100 			if (ret)
101 				return migrate_vma_collect_skip(start, end,
102 								walk);
103 			if (pmd_none(*pmdp))
104 				return migrate_vma_collect_hole(start, end, -1,
105 								walk);
106 		}
107 	}
108 
109 	if (unlikely(pmd_bad(*pmdp)))
110 		return migrate_vma_collect_skip(start, end, walk);
111 
112 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
113 	arch_enter_lazy_mmu_mode();
114 
115 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
116 		unsigned long mpfn = 0, pfn;
117 		struct page *page;
118 		swp_entry_t entry;
119 		pte_t pte;
120 
121 		pte = *ptep;
122 
123 		if (pte_none(pte)) {
124 			if (vma_is_anonymous(vma)) {
125 				mpfn = MIGRATE_PFN_MIGRATE;
126 				migrate->cpages++;
127 			}
128 			goto next;
129 		}
130 
131 		if (!pte_present(pte)) {
132 			/*
133 			 * Only care about unaddressable device private pages'
134 			 * special page table entries. Other special swap entries
135 			 * are not migratable, and we ignore regular swapped pages.
136 			 */
137 			entry = pte_to_swp_entry(pte);
138 			if (!is_device_private_entry(entry))
139 				goto next;
140 
141 			page = pfn_swap_entry_to_page(entry);
142 			if (!(migrate->flags &
143 				MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
144 			    page->pgmap->owner != migrate->pgmap_owner)
145 				goto next;
146 
147 			mpfn = migrate_pfn(page_to_pfn(page)) |
148 					MIGRATE_PFN_MIGRATE;
149 			if (is_writable_device_private_entry(entry))
150 				mpfn |= MIGRATE_PFN_WRITE;
151 		} else {
152 			pfn = pte_pfn(pte);
153 			if (is_zero_pfn(pfn) &&
154 			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
155 				mpfn = MIGRATE_PFN_MIGRATE;
156 				migrate->cpages++;
157 				goto next;
158 			}
159 			page = vm_normal_page(migrate->vma, addr, pte);
160 			if (page && !is_zone_device_page(page) &&
161 			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
162 				goto next;
163 			else if (page && is_device_coherent_page(page) &&
164 			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
165 			     page->pgmap->owner != migrate->pgmap_owner))
166 				goto next;
167 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
168 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
169 		}
170 
171 		/* FIXME support THP */
172 		if (!page || !page->mapping || PageTransCompound(page)) {
173 			mpfn = 0;
174 			goto next;
175 		}
176 
177 		/*
178 		 * By taking a reference on the page we pin it, and that blocks
179 		 * any kind of migration. A side effect is that it "freezes" the
180 		 * pte.
181 		 *
182 		 * We drop this reference after isolating the page from the LRU
183 		 * for non-device pages (device pages are not on the LRU and thus
184 		 * can't be dropped from it).
185 		 */
186 		get_page(page);
187 
188 		/*
189 		 * Optimize for the common case where the page is only mapped once
190 		 * in one process. If we can lock the page, then we can safely
191 		 * set up a special migration page table entry now.
192 		 */
193 		if (trylock_page(page)) {
194 			bool anon_exclusive;
195 			pte_t swp_pte;
196 
197 			flush_cache_page(vma, addr, pte_pfn(*ptep));
198 			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
199 			if (anon_exclusive) {
200 				pte = ptep_clear_flush(vma, addr, ptep);
201 
202 				if (page_try_share_anon_rmap(page)) {
203 					set_pte_at(mm, addr, ptep, pte);
204 					unlock_page(page);
205 					put_page(page);
206 					mpfn = 0;
207 					goto next;
208 				}
209 			} else {
210 				pte = ptep_get_and_clear(mm, addr, ptep);
211 			}
212 
213 			migrate->cpages++;
214 
215 			/* Set the dirty flag on the folio now that the pte is gone. */
216 			if (pte_dirty(pte))
217 				folio_mark_dirty(page_folio(page));
218 
219 			/* Setup special migration page table entry */
220 			if (mpfn & MIGRATE_PFN_WRITE)
221 				entry = make_writable_migration_entry(
222 							page_to_pfn(page));
223 			else if (anon_exclusive)
224 				entry = make_readable_exclusive_migration_entry(
225 							page_to_pfn(page));
226 			else
227 				entry = make_readable_migration_entry(
228 							page_to_pfn(page));
229 			swp_pte = swp_entry_to_pte(entry);
230 			if (pte_present(pte)) {
231 				if (pte_soft_dirty(pte))
232 					swp_pte = pte_swp_mksoft_dirty(swp_pte);
233 				if (pte_uffd_wp(pte))
234 					swp_pte = pte_swp_mkuffd_wp(swp_pte);
235 			} else {
236 				if (pte_swp_soft_dirty(pte))
237 					swp_pte = pte_swp_mksoft_dirty(swp_pte);
238 				if (pte_swp_uffd_wp(pte))
239 					swp_pte = pte_swp_mkuffd_wp(swp_pte);
240 			}
241 			set_pte_at(mm, addr, ptep, swp_pte);
242 
243 			/*
244 			 * This is like a regular unmap: we remove the rmap and
245 			 * drop the page refcount. The page won't be freed, as
246 			 * we took a reference just above.
247 			 */
248 			page_remove_rmap(page, vma, false);
249 			put_page(page);
250 
251 			if (pte_present(pte))
252 				unmapped++;
253 		} else {
254 			put_page(page);
255 			mpfn = 0;
256 		}
257 
258 next:
259 		migrate->dst[migrate->npages] = 0;
260 		migrate->src[migrate->npages++] = mpfn;
261 	}
262 
263 	/* Only flush the TLB if we actually modified any entries */
264 	if (unmapped)
265 		flush_tlb_range(walk->vma, start, end);
266 
267 	arch_leave_lazy_mmu_mode();
268 	pte_unmap_unlock(ptep - 1, ptl);
269 
270 	return 0;
271 }
272 
273 static const struct mm_walk_ops migrate_vma_walk_ops = {
274 	.pmd_entry		= migrate_vma_collect_pmd,
275 	.pte_hole		= migrate_vma_collect_hole,
276 };
277 
278 /*
279  * migrate_vma_collect() - collect pages over a range of virtual addresses
280  * @migrate: migrate struct containing all migration information
281  *
282  * This will walk the CPU page table. For each virtual address backed by a
283  * valid page, it updates the src array and takes a reference on the page, in
284  * order to pin the page until we lock it and unmap it.
285  */
286 static void migrate_vma_collect(struct migrate_vma *migrate)
287 {
288 	struct mmu_notifier_range range;
289 
290 	/*
291 	 * Note that the pgmap_owner is passed to the mmu notifier callback so
292 	 * that the registered device driver can skip invalidating device
293 	 * private page mappings that won't be migrated.
294 	 */
295 	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
296 		migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
297 		migrate->pgmap_owner);
298 	mmu_notifier_invalidate_range_start(&range);
299 
300 	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
301 			&migrate_vma_walk_ops, migrate);
302 
303 	mmu_notifier_invalidate_range_end(&range);
304 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
305 }
306 
307 /*
308  * migrate_vma_check_page() - check if page is pinned or not
309  * @page: struct page to check
310  *
311  * Pinned pages cannot be migrated. This is the same test as in
312  * folio_migrate_mapping(), except that here we allow migration of a
313  * ZONE_DEVICE page.
314  */
315 static bool migrate_vma_check_page(struct page *page)
316 {
317 	/*
318 	 * Expect one extra ref because the caller holds an extra reference,
319 	 * either from isolate_lru_page() for a regular page or from
320 	 * migrate_vma_collect() for a device page.
321 	 */
322 	int extra = 1;
323 
324 	/*
325 	 * FIXME: support THP (transparent huge pages); they are a bit more
326 	 * complex to check than regular pages because they can be mapped with
327 	 * a pmd or with a pte (split pte mapping).
328 	 */
329 	if (PageCompound(page))
330 		return false;
331 
332 	/* Pages from ZONE_DEVICE have one extra reference */
333 	if (is_zone_device_page(page))
334 		extra++;
335 
336 	/* For file-backed pages */
337 	if (page_mapping(page))
338 		extra += 1 + page_has_private(page);
339 
340 	if ((page_count(page) - extra) > page_mapcount(page))
341 		return false;
342 
343 	return true;
344 }
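
/*
 * A worked example of the check above, sketching the common case: an
 * anonymous page mapped in a single process with no extra pins reaches this
 * point with page_mapcount() == 0 (its pte now holds a migration entry) and
 * page_count() == 1 (the isolate_lru_page() reference, the reference taken in
 * migrate_vma_collect_pmd() having been dropped).  With extra = 1 the test
 * reads (1 - 1) > 0, which is false, so the page is considered migratable.
 * A concurrent get_user_pages() pin raises page_count() to 2, making
 * (2 - 1) > 0 true, and migration is refused.
 */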
345 
346 /*
347  * migrate_vma_unmap() - replace page mapping with special migration pte entry
348  * @migrate: migrate struct containing all migration information
349  *
350  * Isolate pages from the LRU and replace their mappings (CPU page table ptes)
351  * with special migration pte entries, then check whether each page has been
352  * pinned. Pinned pages are restored because we cannot migrate them.
353  *
354  * This is the last step before we call the device driver callback to allocate
355  * destination memory and copy the contents of the original pages to the new pages.
356  */
357 static void migrate_vma_unmap(struct migrate_vma *migrate)
358 {
359 	const unsigned long npages = migrate->npages;
360 	unsigned long i, restore = 0;
361 	bool allow_drain = true;
362 
363 	lru_add_drain();
364 
365 	for (i = 0; i < npages; i++) {
366 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
367 		struct folio *folio;
368 
369 		if (!page)
370 			continue;
371 
372 		/* ZONE_DEVICE pages are not on LRU */
373 		if (!is_zone_device_page(page)) {
374 			if (!PageLRU(page) && allow_drain) {
375 				/* Drain CPU's pagevec */
376 				lru_add_drain_all();
377 				allow_drain = false;
378 			}
379 
380 			if (isolate_lru_page(page)) {
381 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
382 				migrate->cpages--;
383 				restore++;
384 				continue;
385 			}
386 
387 			/* Drop the reference we took in collect */
388 			put_page(page);
389 		}
390 
391 		folio = page_folio(page);
392 		if (folio_mapped(folio))
393 			try_to_migrate(folio, 0);
394 
395 		if (page_mapped(page) || !migrate_vma_check_page(page)) {
396 			if (!is_zone_device_page(page)) {
397 				get_page(page);
398 				putback_lru_page(page);
399 			}
400 
401 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
402 			migrate->cpages--;
403 			restore++;
404 			continue;
405 		}
406 	}
407 
408 	for (i = 0; i < npages && restore; i++) {
409 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
410 		struct folio *folio;
411 
412 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
413 			continue;
414 
415 		folio = page_folio(page);
416 		remove_migration_ptes(folio, folio, false);
417 
418 		migrate->src[i] = 0;
419 		folio_unlock(folio);
420 		folio_put(folio);
421 		restore--;
422 	}
423 }
424 
425 /**
426  * migrate_vma_setup() - prepare to migrate a range of memory
427  * @args: contains the vma, start, and pfns arrays for the migration
428  *
429  * Returns: negative errno on failures, 0 when 0 or more pages were migrated
430  * without an error.
431  *
432  * Prepare to migrate a virtual address range by collecting all the pages
433  * backing each virtual address in the range, saving them inside the src
434  * array.  Then lock those pages and unmap them. Once the pages are locked
435  * and unmapped, check whether each page is pinned or not.  Pages that aren't
436  * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
437  * corresponding src array entry.  Any pages that are pinned are then restored
438  * by remapping and unlocking them.
439  *
440  * The caller should then allocate destination memory and copy source memory to
441  * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
442  * flag set).  Once these are allocated and copied, the caller must update each
443  * corresponding entry in the dst array with the pfn value of the destination
444  * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
445  * lock_page().
446  *
447  * Note that the caller does not have to migrate all the pages that are marked
448  * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
449  * device memory to system memory.  If the caller cannot migrate a device page
450  * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
451  * consequences for the userspace process, so it must be avoided if at all
452  * possible.
453  *
454  * For empty entries inside the CPU page table (pte_none() or pmd_none() is
455  * true) we do set the MIGRATE_PFN_MIGRATE flag in the corresponding src array
456  * entry, allowing the caller to allocate device memory for those unbacked virtual
457  * addresses.  For this the caller simply has to allocate device memory and
458  * properly set the destination entry like for regular migration.  Note that
459  * this can still fail, and thus inside the device driver you must check if the
460  * migration was successful for those entries after calling migrate_vma_pages(),
461  * just like for regular migration.
462  *
463  * After that, the caller must call migrate_vma_pages() to go over each entry
464  * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags
465  * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID
466  * flag set, then migrate_vma_pages() migrates struct page information from the
467  * source struct page to the destination struct page.  If it fails to migrate
468  * the struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in
469  * the src array.
470  *
471  * At this point all successfully migrated pages have an entry in the src
472  * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
473  * array entry with MIGRATE_PFN_VALID flag set.
474  *
475  * Once migrate_vma_pages() returns the caller may inspect which pages were
476  * successfully migrated, and which were not.  Successfully migrated pages will
477  * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
478  *
479  * It is safe to update device page table after migrate_vma_pages() because
480  * both destination and source page are still locked, and the mmap_lock is held
481  * in read mode (hence no one can unmap the range being migrated).
482  *
483  * Once the caller is done cleaning up things and updating its page table (if it
484  * chose to do so, this is not an obligation) it finally calls
485  * migrate_vma_finalize() to update the CPU page table to point to new pages
486  * for successfully migrated pages or otherwise restore the CPU page table to
487  * point to the original source pages.
488  */
489 int migrate_vma_setup(struct migrate_vma *args)
490 {
491 	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
492 
493 	args->start &= PAGE_MASK;
494 	args->end &= PAGE_MASK;
495 	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
496 	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
497 		return -EINVAL;
498 	if (nr_pages <= 0)
499 		return -EINVAL;
500 	if (args->start < args->vma->vm_start ||
501 	    args->start >= args->vma->vm_end)
502 		return -EINVAL;
503 	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
504 		return -EINVAL;
505 	if (!args->src || !args->dst)
506 		return -EINVAL;
507 
508 	memset(args->src, 0, sizeof(*args->src) * nr_pages);
509 	args->cpages = 0;
510 	args->npages = 0;
511 
512 	migrate_vma_collect(args);
513 
514 	if (args->cpages)
515 		migrate_vma_unmap(args);
516 
517 	/*
518 	 * At this point pages are locked and unmapped, and thus they have
519 	 * stable content and can safely be copied to destination memory that
520 	 * is allocated by the drivers.
521 	 */
522 	return 0;
523 
524 }
525 EXPORT_SYMBOL(migrate_vma_setup);
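
/*
 * A minimal usage sketch of the flow documented above (an illustration, not
 * part of the upstream file): a hypothetical driver migrating an aligned,
 * anonymous range of system memory into its device memory while holding the
 * mmap_lock in read mode.  example_migrate_to_device(), dmirror_alloc_page()
 * and dmirror_copy_to_device() are made-up placeholders for driver-specific
 * code; everything else uses migrate_vma_setup(), migrate_vma_pages() and
 * migrate_vma_finalize() as exported here, and error handling is abridged.
 */
static int example_migrate_to_device(struct vm_area_struct *vma,
				     unsigned long start, unsigned long end,
				     void *pgmap_owner)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret = -ENOMEM;

	args.src = kvcalloc(npages, sizeof(*args.src), GFP_KERNEL);
	args.dst = kvcalloc(npages, sizeof(*args.dst), GFP_KERNEL);
	if (!args.src || !args.dst)
		goto out;

	/* Phase 1: collect, lock and unmap the source pages. */
	ret = migrate_vma_setup(&args);
	if (ret)
		goto out;

	/* Phase 2: allocate and fill a locked device page per migrating entry. */
	for (i = 0; i < npages; i++) {
		struct page *spage = migrate_pfn_to_page(args.src[i]);
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = dmirror_alloc_page();	/* hypothetical device allocator */
		if (!dpage)
			continue;	/* leaving dst[i] == 0 skips this entry */
		lock_page(dpage);
		if (spage)
			dmirror_copy_to_device(dpage, spage);	/* hypothetical */
		/* for a hole (!spage) a real driver would clear dpage instead */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage));
	}

	/* Phase 3: move struct page metadata, then fix up the CPU page table. */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
out:
	kvfree(args.src);
	kvfree(args.dst);
	return ret;
}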
526 
527 /*
528  * This code closely matches the code in:
529  *   __handle_mm_fault()
530  *     handle_pte_fault()
531  *       do_anonymous_page()
532  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
533  * private or coherent page.
534  */
535 static void migrate_vma_insert_page(struct migrate_vma *migrate,
536 				    unsigned long addr,
537 				    struct page *page,
538 				    unsigned long *src)
539 {
540 	struct vm_area_struct *vma = migrate->vma;
541 	struct mm_struct *mm = vma->vm_mm;
542 	bool flush = false;
543 	spinlock_t *ptl;
544 	pte_t entry;
545 	pgd_t *pgdp;
546 	p4d_t *p4dp;
547 	pud_t *pudp;
548 	pmd_t *pmdp;
549 	pte_t *ptep;
550 
551 	/* Only allow populating anonymous memory */
552 	if (!vma_is_anonymous(vma))
553 		goto abort;
554 
555 	pgdp = pgd_offset(mm, addr);
556 	p4dp = p4d_alloc(mm, pgdp, addr);
557 	if (!p4dp)
558 		goto abort;
559 	pudp = pud_alloc(mm, p4dp, addr);
560 	if (!pudp)
561 		goto abort;
562 	pmdp = pmd_alloc(mm, pudp, addr);
563 	if (!pmdp)
564 		goto abort;
565 
566 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
567 		goto abort;
568 
569 	/*
570 	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
571 	 * pte_offset_map() on pmds where a huge pmd might be created
572 	 * from a different thread.
573 	 *
574 	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
575 	 * parallel threads are excluded by other means.
576 	 *
577 	 * Here we only have mmap_read_lock(mm).
578 	 */
579 	if (pte_alloc(mm, pmdp))
580 		goto abort;
581 
582 	/* See the comment in pte_alloc_one_map() */
583 	if (unlikely(pmd_trans_unstable(pmdp)))
584 		goto abort;
585 
586 	if (unlikely(anon_vma_prepare(vma)))
587 		goto abort;
588 	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
589 		goto abort;
590 
591 	/*
592 	 * The memory barrier inside __SetPageUptodate makes sure that
593 	 * preceding stores to the page contents become visible before
594 	 * the set_pte_at() write.
595 	 */
596 	__SetPageUptodate(page);
597 
598 	if (is_device_private_page(page)) {
599 		swp_entry_t swp_entry;
600 
601 		if (vma->vm_flags & VM_WRITE)
602 			swp_entry = make_writable_device_private_entry(
603 						page_to_pfn(page));
604 		else
605 			swp_entry = make_readable_device_private_entry(
606 						page_to_pfn(page));
607 		entry = swp_entry_to_pte(swp_entry);
608 	} else {
609 		if (is_zone_device_page(page) &&
610 		    !is_device_coherent_page(page)) {
611 			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
612 			goto abort;
613 		}
614 		entry = mk_pte(page, vma->vm_page_prot);
615 		if (vma->vm_flags & VM_WRITE)
616 			entry = pte_mkwrite(pte_mkdirty(entry));
617 	}
618 
619 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
620 
621 	if (check_stable_address_space(mm))
622 		goto unlock_abort;
623 
624 	if (pte_present(*ptep)) {
625 		unsigned long pfn = pte_pfn(*ptep);
626 
627 		if (!is_zero_pfn(pfn))
628 			goto unlock_abort;
629 		flush = true;
630 	} else if (!pte_none(*ptep))
631 		goto unlock_abort;
632 
633 	/*
634 	 * Check for userfaultfd but do not deliver the fault. Instead,
635 	 * just back off.
636 	 */
637 	if (userfaultfd_missing(vma))
638 		goto unlock_abort;
639 
640 	inc_mm_counter(mm, MM_ANONPAGES);
641 	page_add_new_anon_rmap(page, vma, addr);
642 	if (!is_zone_device_page(page))
643 		lru_cache_add_inactive_or_unevictable(page, vma);
644 	get_page(page);
645 
646 	if (flush) {
647 		flush_cache_page(vma, addr, pte_pfn(*ptep));
648 		ptep_clear_flush_notify(vma, addr, ptep);
649 		set_pte_at_notify(mm, addr, ptep, entry);
650 		update_mmu_cache(vma, addr, ptep);
651 	} else {
652 		/* No need to invalidate - it was non-present before */
653 		set_pte_at(mm, addr, ptep, entry);
654 		update_mmu_cache(vma, addr, ptep);
655 	}
656 
657 	pte_unmap_unlock(ptep, ptl);
658 	*src = MIGRATE_PFN_MIGRATE;
659 	return;
660 
661 unlock_abort:
662 	pte_unmap_unlock(ptep, ptl);
663 abort:
664 	*src &= ~MIGRATE_PFN_MIGRATE;
665 }
666 
667 /**
668  * migrate_vma_pages() - migrate meta-data from src page to dst page
669  * @migrate: migrate struct containing all migration information
670  *
671  * This migrates struct page meta-data from source struct page to destination
672  * struct page. This effectively finishes the migration from source page to the
673  * destination page.
674  */
675 void migrate_vma_pages(struct migrate_vma *migrate)
676 {
677 	const unsigned long npages = migrate->npages;
678 	const unsigned long start = migrate->start;
679 	struct mmu_notifier_range range;
680 	unsigned long addr, i;
681 	bool notified = false;
682 
683 	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
684 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
685 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
686 		struct address_space *mapping;
687 		int r;
688 
689 		if (!newpage) {
690 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
691 			continue;
692 		}
693 
694 		if (!page) {
695 			/*
696 			 * The only time there is no vma is when called from
697 			 * migrate_device_coherent_page(). However, migrate_vma_pages()
698 			 * is not called from there if the page could not be unmapped.
699 			 */
700 			VM_BUG_ON(!migrate->vma);
701 			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
702 				continue;
703 			if (!notified) {
704 				notified = true;
705 
706 				mmu_notifier_range_init_owner(&range,
707 					MMU_NOTIFY_MIGRATE, 0, migrate->vma,
708 					migrate->vma->vm_mm, addr, migrate->end,
709 					migrate->pgmap_owner);
710 				mmu_notifier_invalidate_range_start(&range);
711 			}
712 			migrate_vma_insert_page(migrate, addr, newpage,
713 						&migrate->src[i]);
714 			continue;
715 		}
716 
717 		mapping = page_mapping(page);
718 
719 		if (is_device_private_page(newpage) ||
720 		    is_device_coherent_page(newpage)) {
721 			/*
722 			 * For now only support anonymous memory migrating to
723 			 * device private or coherent memory.
724 			 */
725 			if (mapping) {
726 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
727 				continue;
728 			}
729 		} else if (is_zone_device_page(newpage)) {
730 			/*
731 			 * Other types of ZONE_DEVICE page are not supported.
732 			 */
733 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
734 			continue;
735 		}
736 
737 		r = migrate_folio(mapping, page_folio(newpage),
738 				page_folio(page), MIGRATE_SYNC_NO_COPY);
739 		if (r != MIGRATEPAGE_SUCCESS)
740 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
741 	}
742 
743 	/*
744 	 * No need to double call mmu_notifier->invalidate_range() callback as
745 	 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
746 	 * did already call it.
747 	 */
748 	if (notified)
749 		mmu_notifier_invalidate_range_only_end(&range);
750 }
751 EXPORT_SYMBOL(migrate_vma_pages);
752 
753 /**
754  * migrate_vma_finalize() - restore CPU page table entry
755  * @migrate: migrate struct containing all migration information
756  *
757  * This replaces the special migration pte entry with either a mapping to the
758  * new page if migration was successful for that page, or to the original page
759  * otherwise.
760  *
761  * This also unlocks the pages and puts them back on the LRU, or, for device
762  * pages, drops the extra refcount.
763  */
764 void migrate_vma_finalize(struct migrate_vma *migrate)
765 {
766 	const unsigned long npages = migrate->npages;
767 	unsigned long i;
768 
769 	for (i = 0; i < npages; i++) {
770 		struct folio *dst, *src;
771 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
772 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
773 
774 		if (!page) {
775 			if (newpage) {
776 				unlock_page(newpage);
777 				put_page(newpage);
778 			}
779 			continue;
780 		}
781 
782 		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
783 			if (newpage) {
784 				unlock_page(newpage);
785 				put_page(newpage);
786 			}
787 			newpage = page;
788 		}
789 
790 		src = page_folio(page);
791 		dst = page_folio(newpage);
792 		remove_migration_ptes(src, dst, false);
793 		folio_unlock(src);
794 
795 		if (is_zone_device_page(page))
796 			put_page(page);
797 		else
798 			putback_lru_page(page);
799 
800 		if (newpage != page) {
801 			unlock_page(newpage);
802 			if (is_zone_device_page(newpage))
803 				put_page(newpage);
804 			else
805 				putback_lru_page(newpage);
806 		}
807 	}
808 }
809 EXPORT_SYMBOL(migrate_vma_finalize);
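
/*
 * A complementary sketch for the reverse direction (again an illustration, not
 * part of the upstream file): migrating a single device private page back to
 * system memory from a driver's dev_pagemap_ops->migrate_to_ram() CPU fault
 * handler.  example_migrate_to_ram() and dmirror_copy_from_device() are
 * made-up stand-ins for driver-specific code.  As noted in the
 * migrate_vma_setup() documentation, failure to migrate a device page back
 * must be reported as VM_FAULT_SIGBUS.
 */
static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src_pfn = 0, dst_pfn = 0;
	struct migrate_vma args = {
		.vma		= vmf->vma,
		.start		= vmf->address,
		.end		= vmf->address + PAGE_SIZE,
		.src		= &src_pfn,
		.dst		= &dst_pfn,
		.pgmap_owner	= vmf->page->pgmap->owner,
		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
	};
	struct page *spage, *dpage;
	vm_fault_t ret = 0;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	spage = migrate_pfn_to_page(src_pfn);
	if (spage && (src_pfn & MIGRATE_PFN_MIGRATE)) {
		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vmf->vma,
				       vmf->address);
		if (dpage) {
			lock_page(dpage);
			dmirror_copy_from_device(dpage, spage);	/* hypothetical */
			dst_pfn = migrate_pfn(page_to_pfn(dpage));
		} else {
			ret = VM_FAULT_SIGBUS;
		}
	}

	/* With dst_pfn == 0 this pair simply restores the original mapping. */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return ret;
}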
810 
811 /*
812  * Migrate a device coherent page back to normal memory. The caller should have
813  * a reference on the page, which will be copied to the new page if migration
814  * is successful, or dropped on failure.
815  */
816 int migrate_device_coherent_page(struct page *page)
817 {
818 	unsigned long src_pfn, dst_pfn = 0;
819 	struct migrate_vma args;
820 	struct page *dpage;
821 
822 	WARN_ON_ONCE(PageCompound(page));
823 
824 	lock_page(page);
825 	src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
826 	args.src = &src_pfn;
827 	args.dst = &dst_pfn;
828 	args.cpages = 1;
829 	args.npages = 1;
830 	args.vma = NULL;
831 
832 	/*
833 	 * We don't have a VMA and don't need to walk the page tables to find
834 	 * the source page. So call migrate_vma_unmap() directly to unmap the
835 	 * page as migrate_vma_setup() will fail if args.vma == NULL.
836 	 */
837 	migrate_vma_unmap(&args);
838 	if (!(src_pfn & MIGRATE_PFN_MIGRATE))
839 		return -EBUSY;
840 
841 	dpage = alloc_page(GFP_USER | __GFP_NOWARN);
842 	if (dpage) {
843 		lock_page(dpage);
844 		dst_pfn = migrate_pfn(page_to_pfn(dpage));
845 	}
846 
847 	migrate_vma_pages(&args);
848 	if (src_pfn & MIGRATE_PFN_MIGRATE)
849 		copy_highpage(dpage, page);
850 	migrate_vma_finalize(&args);
851 
852 	if (src_pfn & MIGRATE_PFN_MIGRATE)
853 		return 0;
854 	return -EBUSY;
855 }
856