// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

unsigned int page_reporting_order = MAX_ORDER;
module_param(page_reporting_order, uint, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in the desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases it should be present; however, in the unlikely case of
	 * a shutdown it will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}
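
/*
 * For context: __page_reporting_notify() is reached from the buddy
 * allocator's free path through page_reporting_notify_free() in
 * mm/page_reporting.h. A sketch of that helper (paraphrased here, not
 * verbatim):
 *
 *	static inline void page_reporting_notify_free(unsigned int order)
 *	{
 *		// cheap static-branch gate, enabled at registration
 *		if (!static_branch_unlikely(&page_reporting_enabled))
 *			return;
 *
 *		// only freed blocks at or above the reporting order matter
 *		if (order < page_reporting_order)
 *			return;
 *
 *		__page_reporting_notify();
 *	}
 */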

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to an error, skip flagging */
		if (!reported)
			continue;

		/*
		 * If the page was not commingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && buddy_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages; in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	long budget;
	int err = 0;

	/*
	 * Perform an early check: if the free area is empty there is
	 * nothing to process, so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/*
	 * Limit how many calls we will be making to the page reporting
	 * device for this list. By doing this we avoid processing any
	 * given list for too long.
	 *
	 * The current value used allows us enough calls to process over a
	 * sixteenth of the current list plus one additional call to handle
	 * any pages that may have already been present from the previous
	 * list processed. This should result in us reporting all pages on
	 * an idle system in about 30 seconds.
	 *
	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
	 * should always be a power of 2.
	 */
	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
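
	/*
	 * Worked example for the budget above, assuming the current
	 * PAGE_REPORTING_CAPACITY of 32: a free list holding 10000 pages
	 * gives DIV_ROUND_UP(10000, 32 * 16) = 20, i.e. about 20 full
	 * scatterlists (20 * 32 = 640 page blocks at this order) may be
	 * reported before the budget runs out and processing is deferred.
	 */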

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/*
		 * If we fully consumed our budget then update our
		 * state to indicate that we are requesting additional
		 * processing and exit this list.
		 */
		if (budget < 0) {
			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
			next = page;
			break;
		}

		/* Attempt to pull page from list and place in scatterlist */
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-reported page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (!list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* update budget to reflect call to report function */
		budget--;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}
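
/*
 * A note on the *offset bookkeeping in page_reporting_cycle() above: the
 * scatterlist is filled from the tail toward index 0 (sg_set_page() stores
 * at --(*offset)), so a partially filled batch always occupies the end of
 * the table. That is why page_reporting_process_zone() below can report
 * the leftovers by simply starting at &sgl[offset].
 */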

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << page_reporting_order);

	/*
	 * Cancel request if insufficient free memory or if we failed
	 * to allocate page reporting statistics for the zone.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = page_reporting_order; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}
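
/*
 * Worked example for the watermark guard above: with 4 KiB base pages and
 * a pageblock_order of 9 (one common configuration), the default reporting
 * order makes the required headroom PAGE_REPORTING_CAPACITY << 9 =
 * 32 * 512 = 16384 pages, i.e. roughly 64 MiB above the zone's low
 * watermark.
 */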

static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If the state
	 * is not altered by the end of the pass we will switch to idle and
	 * quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}
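
/*
 * In summary, the prdev->state transitions driven by the functions above:
 *
 *	IDLE      -> REQUESTED   __page_reporting_request() (atomic_xchg)
 *	REQUESTED -> ACTIVE      worker entry in page_reporting_process()
 *	ACTIVE    -> IDLE        pass completed with no new request (cmpxchg)
 *	ACTIVE    -> REQUESTED   new notification or budget exhausted; the
 *	                         work is then rescheduled after the delay
 */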

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/*
	 * Update the page reporting order if it's specified by the driver.
	 * Otherwise, it falls back to @pageblock_order.
	 */
	page_reporting_order = prdev->order ? : pageblock_order;

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);
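
/*
 * Illustrative sketch (not part of this file) of how a backend driver
 * might plug into this interface. The "example_" names are invented for
 * illustration; the callback signature matches the report() member of
 * struct page_reporting_dev_info in include/linux/page_reporting.h.
 *
 *	static int example_report(struct page_reporting_dev_info *prdev,
 *				  struct scatterlist *sgl, unsigned int nents)
 *	{
 *		struct scatterlist *sg;
 *		int i;
 *
 *		// Hand each run of free pages to the hypervisor/device.
 *		// example_hint_range() stands in for the real mechanism.
 *		for_each_sg(sgl, sg, nents, i)
 *			example_hint_range(page_to_phys(sg_page(sg)),
 *					   sg->length);
 *
 *		return 0;	// nonzero aborts the current reporting pass
 *	}
 *
 *	static struct page_reporting_dev_info example_prdev = {
 *		.report	= example_report,
 *		.order	= 0,	// 0 selects the pageblock_order fallback
 *	};
 *
 *	err = page_reporting_register(&example_prdev);
 */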