// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in the desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present, however in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}
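
/*
 * For context: the allocator does not call __page_reporting_notify()
 * directly. A minimal sketch of the inline screener that mm/page_reporting.h
 * provides (treat the exact body as an approximation of the header at this
 * point in time, not a quote):
 *
 *	static inline void page_reporting_notify_free(unsigned int order)
 *	{
 *		// Called from the hot path in __free_one_page()
 *		if (!static_branch_unlikely(&page_reporting_enabled))
 *			return;
 *
 *		// Only orders large enough to be worth reporting
 *		if (order < PAGE_REPORTING_MIN_ORDER)
 *			return;
 *
 *		__page_reporting_notify();
 *	}
 *
 * The static key keeps the cost of this check near zero while no reporting
 * device is registered.
 */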

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to an error, skip flagging */
		if (!reported)
			continue;

		/*
		 * If the page was not commingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && page_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages; at that point we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	int err = 0;

	/*
	 * Perform an early check: if the free area is empty there is
	 * nothing to process, so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/* Attempt to pull page from list */
		if (!__isolate_free_page(page, order))
			break;

		/* Add page to scatter list */
		--(*offset);
		sg_set_page(&sgl[*offset], page, page_len, 0);

		/* If scatterlist isn't full grab more pages */
		if (*offset)
			continue;

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	spin_unlock_irq(&zone->lock);

	return err;
}
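
/*
 * Each scatterlist entry handed to prdev->report() above describes one
 * isolated, buddy-removed run of pages of size PAGE_SIZE << order. A
 * hypothetical report() implementation (sketch only; "tell_hypervisor" is
 * an illustrative placeholder, not a real API) would walk the list like so:
 *
 *	static int example_report(struct page_reporting_dev_info *prdev,
 *				  struct scatterlist *sgl, unsigned int nents)
 *	{
 *		struct scatterlist *sg;
 *		int i;
 *
 *		for_each_sg(sgl, sg, nents, i) {
 *			struct page *page = sg_page(sg);
 *			unsigned int order = get_order(sg->length);
 *
 *			// hand [page, page + (1 << order)) to the hypervisor
 *			if (tell_hypervisor(page_to_pfn(page), 1 << order))
 *				return -EFAULT;
 *		}
 *		return 0;
 *	}
 *
 * A nonzero return causes page_reporting_cycle() to put the pages back
 * unreported and abort the pass.
 */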

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);

	/*
	 * Cancel the request if there is insufficient free memory in the
	 * zone to guarantee forward progress.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}
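
/*
 * For a sense of scale for the watermark check above: assuming the defaults
 * in mm/page_reporting.h at the time of this commit (PAGE_REPORTING_CAPACITY
 * of 32 and PAGE_REPORTING_MIN_ORDER equal to pageblock_order, typically 9
 * on x86-64), a zone must hold roughly 32 << 9 = 16384 pages, i.e. about
 * 64 MiB with 4 KiB pages, free above its low watermark before we will pull
 * pages from it.
 */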

static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If the state
	 * is not altered by the end of the pass we will switch to idle and
	 * quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);
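
/*
 * A hypothetical driver would wire this up roughly as follows (sketch only;
 * "my_dev" and "my_report" are illustrative names, not an existing driver):
 *
 *	static struct page_reporting_dev_info my_dev = {
 *		.report = my_report,
 *	};
 *
 *	// during probe:
 *	err = page_reporting_register(&my_dev);
 *
 *	// during remove:
 *	page_reporting_unregister(&my_dev);
 *
 * Only one reporting device can be registered at a time; a second call to
 * page_reporting_register() fails with -EBUSY until the first device is
 * unregistered.
 */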

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);