xref: /openbmc/linux/mm/page_reporting.c (revision 02cf8719b8cb2b474b37bcbeee4706950b3d1d3d)
136e66c55SAlexander Duyck // SPDX-License-Identifier: GPL-2.0
236e66c55SAlexander Duyck #include <linux/mm.h>
336e66c55SAlexander Duyck #include <linux/mmzone.h>
436e66c55SAlexander Duyck #include <linux/page_reporting.h>
536e66c55SAlexander Duyck #include <linux/gfp.h>
636e66c55SAlexander Duyck #include <linux/export.h>
736e66c55SAlexander Duyck #include <linux/delay.h>
836e66c55SAlexander Duyck #include <linux/scatterlist.h>
936e66c55SAlexander Duyck 
1036e66c55SAlexander Duyck #include "page_reporting.h"
1136e66c55SAlexander Duyck #include "internal.h"
1236e66c55SAlexander Duyck 
1336e66c55SAlexander Duyck #define PAGE_REPORTING_DELAY	(2 * HZ)
1436e66c55SAlexander Duyck static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
1536e66c55SAlexander Duyck 
1636e66c55SAlexander Duyck enum {
1736e66c55SAlexander Duyck 	PAGE_REPORTING_IDLE = 0,
1836e66c55SAlexander Duyck 	PAGE_REPORTING_REQUESTED,
1936e66c55SAlexander Duyck 	PAGE_REPORTING_ACTIVE
2036e66c55SAlexander Duyck };
2136e66c55SAlexander Duyck 
2236e66c55SAlexander Duyck /* request page reporting */
2336e66c55SAlexander Duyck static void
2436e66c55SAlexander Duyck __page_reporting_request(struct page_reporting_dev_info *prdev)
2536e66c55SAlexander Duyck {
2636e66c55SAlexander Duyck 	unsigned int state;
2736e66c55SAlexander Duyck 
2836e66c55SAlexander Duyck 	/* Check to see if we are in desired state */
2936e66c55SAlexander Duyck 	state = atomic_read(&prdev->state);
3036e66c55SAlexander Duyck 	if (state == PAGE_REPORTING_REQUESTED)
3136e66c55SAlexander Duyck 		return;
3236e66c55SAlexander Duyck 
3336e66c55SAlexander Duyck 	/*
3436e66c55SAlexander Duyck 	 *  If reporting is already active there is nothing we need to do.
3536e66c55SAlexander Duyck 	 *  Test against 0 as that represents PAGE_REPORTING_IDLE.
3636e66c55SAlexander Duyck 	 */
3736e66c55SAlexander Duyck 	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
3836e66c55SAlexander Duyck 	if (state != PAGE_REPORTING_IDLE)
3936e66c55SAlexander Duyck 		return;
4036e66c55SAlexander Duyck 
4136e66c55SAlexander Duyck 	/*
4236e66c55SAlexander Duyck 	 * Delay the start of work to allow a sizable queue to build. For
4336e66c55SAlexander Duyck 	 * now we are limiting this to running no more than once every
4436e66c55SAlexander Duyck 	 * couple of seconds.
4536e66c55SAlexander Duyck 	 */
4636e66c55SAlexander Duyck 	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
4736e66c55SAlexander Duyck }
4836e66c55SAlexander Duyck 
4936e66c55SAlexander Duyck /* notify prdev of free page reporting request */
5036e66c55SAlexander Duyck void __page_reporting_notify(void)
5136e66c55SAlexander Duyck {
5236e66c55SAlexander Duyck 	struct page_reporting_dev_info *prdev;
5336e66c55SAlexander Duyck 
5436e66c55SAlexander Duyck 	/*
5536e66c55SAlexander Duyck 	 * We use RCU to protect the pr_dev_info pointer. In almost all
5636e66c55SAlexander Duyck 	 * cases this should be present, however in the unlikely case of
5736e66c55SAlexander Duyck 	 * a shutdown this will be NULL and we should exit.
5836e66c55SAlexander Duyck 	 */
5936e66c55SAlexander Duyck 	rcu_read_lock();
6036e66c55SAlexander Duyck 	prdev = rcu_dereference(pr_dev_info);
6136e66c55SAlexander Duyck 	if (likely(prdev))
6236e66c55SAlexander Duyck 		__page_reporting_request(prdev);
6336e66c55SAlexander Duyck 
6436e66c55SAlexander Duyck 	rcu_read_unlock();
6536e66c55SAlexander Duyck }
6636e66c55SAlexander Duyck 
6736e66c55SAlexander Duyck static void
6836e66c55SAlexander Duyck page_reporting_drain(struct page_reporting_dev_info *prdev,
6936e66c55SAlexander Duyck 		     struct scatterlist *sgl, unsigned int nents, bool reported)
7036e66c55SAlexander Duyck {
7136e66c55SAlexander Duyck 	struct scatterlist *sg = sgl;
7236e66c55SAlexander Duyck 
7336e66c55SAlexander Duyck 	/*
7436e66c55SAlexander Duyck 	 * Drain the now reported pages back into their respective
7536e66c55SAlexander Duyck 	 * free lists/areas. We assume at least one page is populated.
7636e66c55SAlexander Duyck 	 */
7736e66c55SAlexander Duyck 	do {
7836e66c55SAlexander Duyck 		struct page *page = sg_page(sg);
7936e66c55SAlexander Duyck 		int mt = get_pageblock_migratetype(page);
8036e66c55SAlexander Duyck 		unsigned int order = get_order(sg->length);
8136e66c55SAlexander Duyck 
8236e66c55SAlexander Duyck 		__putback_isolated_page(page, order, mt);
8336e66c55SAlexander Duyck 
8436e66c55SAlexander Duyck 		/* If the pages were not reported due to error skip flagging */
8536e66c55SAlexander Duyck 		if (!reported)
8636e66c55SAlexander Duyck 			continue;
8736e66c55SAlexander Duyck 
8836e66c55SAlexander Duyck 		/*
8936e66c55SAlexander Duyck 		 * If page was not comingled with another page we can
9036e66c55SAlexander Duyck 		 * consider the result to be "reported" since the page
9136e66c55SAlexander Duyck 		 * hasn't been modified, otherwise we will need to
9236e66c55SAlexander Duyck 		 * report on the new larger page when we make our way
9336e66c55SAlexander Duyck 		 * up to that higher order.
9436e66c55SAlexander Duyck 		 */
9536e66c55SAlexander Duyck 		if (PageBuddy(page) && page_order(page) == order)
9636e66c55SAlexander Duyck 			__SetPageReported(page);
9736e66c55SAlexander Duyck 	} while ((sg = sg_next(sg)));
9836e66c55SAlexander Duyck 
9936e66c55SAlexander Duyck 	/* reinitialize scatterlist now that it is empty */
10036e66c55SAlexander Duyck 	sg_init_table(sgl, nents);
10136e66c55SAlexander Duyck }
10236e66c55SAlexander Duyck 
10336e66c55SAlexander Duyck /*
10436e66c55SAlexander Duyck  * The page reporting cycle consists of 4 stages, fill, report, drain, and
10536e66c55SAlexander Duyck  * idle. We will cycle through the first 3 stages until we cannot obtain a
10636e66c55SAlexander Duyck  * full scatterlist of pages, in that case we will switch to idle.
10736e66c55SAlexander Duyck  */
10836e66c55SAlexander Duyck static int
10936e66c55SAlexander Duyck page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
11036e66c55SAlexander Duyck 		     unsigned int order, unsigned int mt,
11136e66c55SAlexander Duyck 		     struct scatterlist *sgl, unsigned int *offset)
11236e66c55SAlexander Duyck {
11336e66c55SAlexander Duyck 	struct free_area *area = &zone->free_area[order];
11436e66c55SAlexander Duyck 	struct list_head *list = &area->free_list[mt];
11536e66c55SAlexander Duyck 	unsigned int page_len = PAGE_SIZE << order;
11636e66c55SAlexander Duyck 	struct page *page, *next;
11736e66c55SAlexander Duyck 	int err = 0;
11836e66c55SAlexander Duyck 
11936e66c55SAlexander Duyck 	/*
12036e66c55SAlexander Duyck 	 * Perform early check, if free area is empty there is
12136e66c55SAlexander Duyck 	 * nothing to process so we can skip this free_list.
12236e66c55SAlexander Duyck 	 */
12336e66c55SAlexander Duyck 	if (list_empty(list))
12436e66c55SAlexander Duyck 		return err;
12536e66c55SAlexander Duyck 
12636e66c55SAlexander Duyck 	spin_lock_irq(&zone->lock);
12736e66c55SAlexander Duyck 
12836e66c55SAlexander Duyck 	/* loop through free list adding unreported pages to sg list */
12936e66c55SAlexander Duyck 	list_for_each_entry_safe(page, next, list, lru) {
13036e66c55SAlexander Duyck 		/* We are going to skip over the reported pages. */
13136e66c55SAlexander Duyck 		if (PageReported(page))
13236e66c55SAlexander Duyck 			continue;
13336e66c55SAlexander Duyck 
134*02cf8719SAlexander Duyck 		/* Attempt to pull page from list and place in scatterlist */
135*02cf8719SAlexander Duyck 		if (*offset) {
136*02cf8719SAlexander Duyck 			if (!__isolate_free_page(page, order)) {
137*02cf8719SAlexander Duyck 				next = page;
13836e66c55SAlexander Duyck 				break;
139*02cf8719SAlexander Duyck 			}
14036e66c55SAlexander Duyck 
14136e66c55SAlexander Duyck 			/* Add page to scatter list */
14236e66c55SAlexander Duyck 			--(*offset);
14336e66c55SAlexander Duyck 			sg_set_page(&sgl[*offset], page, page_len, 0);
14436e66c55SAlexander Duyck 
14536e66c55SAlexander Duyck 			continue;
146*02cf8719SAlexander Duyck 		}
147*02cf8719SAlexander Duyck 
148*02cf8719SAlexander Duyck 		/*
149*02cf8719SAlexander Duyck 		 * Make the first non-processed page in the free list
150*02cf8719SAlexander Duyck 		 * the new head of the free list before we release the
151*02cf8719SAlexander Duyck 		 * zone lock.
152*02cf8719SAlexander Duyck 		 */
153*02cf8719SAlexander Duyck 		if (&page->lru != list && !list_is_first(&page->lru, list))
154*02cf8719SAlexander Duyck 			list_rotate_to_front(&page->lru, list);
15536e66c55SAlexander Duyck 
15636e66c55SAlexander Duyck 		/* release lock before waiting on report processing */
15736e66c55SAlexander Duyck 		spin_unlock_irq(&zone->lock);
15836e66c55SAlexander Duyck 
15936e66c55SAlexander Duyck 		/* begin processing pages in local list */
16036e66c55SAlexander Duyck 		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
16136e66c55SAlexander Duyck 
16236e66c55SAlexander Duyck 		/* reset offset since the full list was reported */
16336e66c55SAlexander Duyck 		*offset = PAGE_REPORTING_CAPACITY;
16436e66c55SAlexander Duyck 
16536e66c55SAlexander Duyck 		/* reacquire zone lock and resume processing */
16636e66c55SAlexander Duyck 		spin_lock_irq(&zone->lock);
16736e66c55SAlexander Duyck 
16836e66c55SAlexander Duyck 		/* flush reported pages from the sg list */
16936e66c55SAlexander Duyck 		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
17036e66c55SAlexander Duyck 
17136e66c55SAlexander Duyck 		/*
17236e66c55SAlexander Duyck 		 * Reset next to first entry, the old next isn't valid
17336e66c55SAlexander Duyck 		 * since we dropped the lock to report the pages
17436e66c55SAlexander Duyck 		 */
17536e66c55SAlexander Duyck 		next = list_first_entry(list, struct page, lru);
17636e66c55SAlexander Duyck 
17736e66c55SAlexander Duyck 		/* exit on error */
17836e66c55SAlexander Duyck 		if (err)
17936e66c55SAlexander Duyck 			break;
18036e66c55SAlexander Duyck 	}
18136e66c55SAlexander Duyck 
182*02cf8719SAlexander Duyck 	/* Rotate any leftover pages to the head of the freelist */
183*02cf8719SAlexander Duyck 	if (&next->lru != list && !list_is_first(&next->lru, list))
184*02cf8719SAlexander Duyck 		list_rotate_to_front(&next->lru, list);
185*02cf8719SAlexander Duyck 
18636e66c55SAlexander Duyck 	spin_unlock_irq(&zone->lock);
18736e66c55SAlexander Duyck 
18836e66c55SAlexander Duyck 	return err;
18936e66c55SAlexander Duyck }
19036e66c55SAlexander Duyck 
19136e66c55SAlexander Duyck static int
19236e66c55SAlexander Duyck page_reporting_process_zone(struct page_reporting_dev_info *prdev,
19336e66c55SAlexander Duyck 			    struct scatterlist *sgl, struct zone *zone)
19436e66c55SAlexander Duyck {
19536e66c55SAlexander Duyck 	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
19636e66c55SAlexander Duyck 	unsigned long watermark;
19736e66c55SAlexander Duyck 	int err = 0;
19836e66c55SAlexander Duyck 
19936e66c55SAlexander Duyck 	/* Generate minimum watermark to be able to guarantee progress */
20036e66c55SAlexander Duyck 	watermark = low_wmark_pages(zone) +
20136e66c55SAlexander Duyck 		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
20236e66c55SAlexander Duyck 
20336e66c55SAlexander Duyck 	/*
20436e66c55SAlexander Duyck 	 * Cancel request if insufficient free memory or if we failed
20536e66c55SAlexander Duyck 	 * to allocate page reporting statistics for the zone.
20636e66c55SAlexander Duyck 	 */
20736e66c55SAlexander Duyck 	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
20836e66c55SAlexander Duyck 		return err;
20936e66c55SAlexander Duyck 
21036e66c55SAlexander Duyck 	/* Process each free list starting from lowest order/mt */
21136e66c55SAlexander Duyck 	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
21236e66c55SAlexander Duyck 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
21336e66c55SAlexander Duyck 			/* We do not pull pages from the isolate free list */
21436e66c55SAlexander Duyck 			if (is_migrate_isolate(mt))
21536e66c55SAlexander Duyck 				continue;
21636e66c55SAlexander Duyck 
21736e66c55SAlexander Duyck 			err = page_reporting_cycle(prdev, zone, order, mt,
21836e66c55SAlexander Duyck 						   sgl, &offset);
21936e66c55SAlexander Duyck 			if (err)
22036e66c55SAlexander Duyck 				return err;
22136e66c55SAlexander Duyck 		}
22236e66c55SAlexander Duyck 	}
22336e66c55SAlexander Duyck 
22436e66c55SAlexander Duyck 	/* report the leftover pages before going idle */
22536e66c55SAlexander Duyck 	leftover = PAGE_REPORTING_CAPACITY - offset;
22636e66c55SAlexander Duyck 	if (leftover) {
22736e66c55SAlexander Duyck 		sgl = &sgl[offset];
22836e66c55SAlexander Duyck 		err = prdev->report(prdev, sgl, leftover);
22936e66c55SAlexander Duyck 
23036e66c55SAlexander Duyck 		/* flush any remaining pages out from the last report */
23136e66c55SAlexander Duyck 		spin_lock_irq(&zone->lock);
23236e66c55SAlexander Duyck 		page_reporting_drain(prdev, sgl, leftover, !err);
23336e66c55SAlexander Duyck 		spin_unlock_irq(&zone->lock);
23436e66c55SAlexander Duyck 	}
23536e66c55SAlexander Duyck 
23636e66c55SAlexander Duyck 	return err;
23736e66c55SAlexander Duyck }
23836e66c55SAlexander Duyck 
23936e66c55SAlexander Duyck static void page_reporting_process(struct work_struct *work)
24036e66c55SAlexander Duyck {
24136e66c55SAlexander Duyck 	struct delayed_work *d_work = to_delayed_work(work);
24236e66c55SAlexander Duyck 	struct page_reporting_dev_info *prdev =
24336e66c55SAlexander Duyck 		container_of(d_work, struct page_reporting_dev_info, work);
24436e66c55SAlexander Duyck 	int err = 0, state = PAGE_REPORTING_ACTIVE;
24536e66c55SAlexander Duyck 	struct scatterlist *sgl;
24636e66c55SAlexander Duyck 	struct zone *zone;
24736e66c55SAlexander Duyck 
24836e66c55SAlexander Duyck 	/*
24936e66c55SAlexander Duyck 	 * Change the state to "Active" so that we can track if there is
25036e66c55SAlexander Duyck 	 * anyone requests page reporting after we complete our pass. If
25136e66c55SAlexander Duyck 	 * the state is not altered by the end of the pass we will switch
25236e66c55SAlexander Duyck 	 * to idle and quit scheduling reporting runs.
25336e66c55SAlexander Duyck 	 */
25436e66c55SAlexander Duyck 	atomic_set(&prdev->state, state);
25536e66c55SAlexander Duyck 
25636e66c55SAlexander Duyck 	/* allocate scatterlist to store pages being reported on */
25736e66c55SAlexander Duyck 	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
25836e66c55SAlexander Duyck 	if (!sgl)
25936e66c55SAlexander Duyck 		goto err_out;
26036e66c55SAlexander Duyck 
26136e66c55SAlexander Duyck 	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
26236e66c55SAlexander Duyck 
26336e66c55SAlexander Duyck 	for_each_zone(zone) {
26436e66c55SAlexander Duyck 		err = page_reporting_process_zone(prdev, sgl, zone);
26536e66c55SAlexander Duyck 		if (err)
26636e66c55SAlexander Duyck 			break;
26736e66c55SAlexander Duyck 	}
26836e66c55SAlexander Duyck 
26936e66c55SAlexander Duyck 	kfree(sgl);
27036e66c55SAlexander Duyck err_out:
27136e66c55SAlexander Duyck 	/*
27236e66c55SAlexander Duyck 	 * If the state has reverted back to requested then there may be
27336e66c55SAlexander Duyck 	 * additional pages to be processed. We will defer for 2s to allow
27436e66c55SAlexander Duyck 	 * more pages to accumulate.
27536e66c55SAlexander Duyck 	 */
27636e66c55SAlexander Duyck 	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
27736e66c55SAlexander Duyck 	if (state == PAGE_REPORTING_REQUESTED)
27836e66c55SAlexander Duyck 		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
27936e66c55SAlexander Duyck }
28036e66c55SAlexander Duyck 
28136e66c55SAlexander Duyck static DEFINE_MUTEX(page_reporting_mutex);
28236e66c55SAlexander Duyck DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
28336e66c55SAlexander Duyck 
28436e66c55SAlexander Duyck int page_reporting_register(struct page_reporting_dev_info *prdev)
28536e66c55SAlexander Duyck {
28636e66c55SAlexander Duyck 	int err = 0;
28736e66c55SAlexander Duyck 
28836e66c55SAlexander Duyck 	mutex_lock(&page_reporting_mutex);
28936e66c55SAlexander Duyck 
29036e66c55SAlexander Duyck 	/* nothing to do if already in use */
29136e66c55SAlexander Duyck 	if (rcu_access_pointer(pr_dev_info)) {
29236e66c55SAlexander Duyck 		err = -EBUSY;
29336e66c55SAlexander Duyck 		goto err_out;
29436e66c55SAlexander Duyck 	}
29536e66c55SAlexander Duyck 
29636e66c55SAlexander Duyck 	/* initialize state and work structures */
29736e66c55SAlexander Duyck 	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
29836e66c55SAlexander Duyck 	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
29936e66c55SAlexander Duyck 
30036e66c55SAlexander Duyck 	/* Begin initial flush of zones */
30136e66c55SAlexander Duyck 	__page_reporting_request(prdev);
30236e66c55SAlexander Duyck 
30336e66c55SAlexander Duyck 	/* Assign device to allow notifications */
30436e66c55SAlexander Duyck 	rcu_assign_pointer(pr_dev_info, prdev);
30536e66c55SAlexander Duyck 
30636e66c55SAlexander Duyck 	/* enable page reporting notification */
30736e66c55SAlexander Duyck 	if (!static_key_enabled(&page_reporting_enabled)) {
30836e66c55SAlexander Duyck 		static_branch_enable(&page_reporting_enabled);
30936e66c55SAlexander Duyck 		pr_info("Free page reporting enabled\n");
31036e66c55SAlexander Duyck 	}
31136e66c55SAlexander Duyck err_out:
31236e66c55SAlexander Duyck 	mutex_unlock(&page_reporting_mutex);
31336e66c55SAlexander Duyck 
31436e66c55SAlexander Duyck 	return err;
31536e66c55SAlexander Duyck }
31636e66c55SAlexander Duyck EXPORT_SYMBOL_GPL(page_reporting_register);
31736e66c55SAlexander Duyck 
31836e66c55SAlexander Duyck void page_reporting_unregister(struct page_reporting_dev_info *prdev)
31936e66c55SAlexander Duyck {
32036e66c55SAlexander Duyck 	mutex_lock(&page_reporting_mutex);
32136e66c55SAlexander Duyck 
32236e66c55SAlexander Duyck 	if (rcu_access_pointer(pr_dev_info) == prdev) {
32336e66c55SAlexander Duyck 		/* Disable page reporting notification */
32436e66c55SAlexander Duyck 		RCU_INIT_POINTER(pr_dev_info, NULL);
32536e66c55SAlexander Duyck 		synchronize_rcu();
32636e66c55SAlexander Duyck 
32736e66c55SAlexander Duyck 		/* Flush any existing work, and lock it out */
32836e66c55SAlexander Duyck 		cancel_delayed_work_sync(&prdev->work);
32936e66c55SAlexander Duyck 	}
33036e66c55SAlexander Duyck 
33136e66c55SAlexander Duyck 	mutex_unlock(&page_reporting_mutex);
33236e66c55SAlexander Duyck }
33336e66c55SAlexander Duyck EXPORT_SYMBOL_GPL(page_reporting_unregister);
334