// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

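/*
 * Reporting state machine:
 *   PAGE_REPORTING_IDLE      - no reporting work is scheduled
 *   PAGE_REPORTING_REQUESTED - work has been scheduled, or needs rescheduling
 *   PAGE_REPORTING_ACTIVE    - work is currently walking the zones
 * See __page_reporting_request() and page_reporting_process() for the
 * transitions between these states.
 */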
enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in the desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Only a transition from PAGE_REPORTING_IDLE means this caller is
	 * the one responsible for scheduling the work.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present; however, in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

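/*
 * Return a batch of previously isolated pages to the buddy allocator,
 * marking each page Reported when the device call succeeded and the page
 * is still a buddy of the same order (i.e. it did not merge with another
 * page while it was isolated).
 */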
static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to an error, skip flagging */
		if (!reported)
			continue;

		/*
		 * If the page was not commingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to report
		 * on the new larger page when we make our way up to that
		 * higher order.
		 */
		if (PageBuddy(page) && page_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of four stages: fill, report, drain,
 * and idle. We will cycle through the first three stages until we cannot
 * obtain a full scatterlist of pages; in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	int err = 0;

	/*
	 * Perform an early check: if the free area is empty there is
	 * nothing to process, so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

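	/*
	 * The scatterlist is filled from the tail toward the head: *offset
	 * is the index of the first populated entry, so a partially filled
	 * batch sits contiguously at sgl[*offset..PAGE_REPORTING_CAPACITY-1].
	 */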
	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/* Attempt to pull page from list */
		if (!__isolate_free_page(page, order))
			break;

		/* Add page to scatter list */
		--(*offset);
		sg_set_page(&sgl[*offset], page, page_len, 0);

		/* If the scatterlist isn't full, grab more pages */
		if (*offset)
			continue;

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	spin_unlock_irq(&zone->lock);

	return err;
}

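/*
 * Walk every eligible free list in the zone, batching unreported pages
 * into sgl and reporting them PAGE_REPORTING_CAPACITY entries at a time.
 * Any partially filled batch left over at the end is reported before
 * returning.
 */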
static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/*
	 * Generate minimum watermark to be able to guarantee progress:
	 * leave room for one full batch of MIN_ORDER pages above the
	 * low watermark before we start pulling pages.
	 */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);

	/* Cancel the request if there is insufficient free memory */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

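/*
 * Workqueue callback: walk all zones once, reporting free pages to the
 * registered device, then either go idle or reschedule if another
 * reporting request arrived while we were active.
 */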
static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If the state
	 * is not altered by the end of the pass we will switch to idle and
	 * quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
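/*
 * Static key checked in the page freeing path (see page_reporting.h) so
 * that the reporting hook costs nothing until a device is registered.
 */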
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

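/*
 * Register a device to receive free page reports. Only one reporting
 * device may be registered at a time; an initial pass over all zones is
 * scheduled on success. Returns 0, or -EBUSY if a device is already
 * registered.
 */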
int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

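/*
 * Tear down free page reporting for a device: clear the device pointer,
 * wait for in-flight RCU readers in __page_reporting_notify() to finish,
 * then cancel any outstanding work so report() is not invoked again.
 */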
void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);