// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

/* Delay before a scheduled reporting pass runs, letting free pages queue up */
#define PAGE_REPORTING_DELAY (2 * HZ)

/* The single registered reporting device; NULL when none is registered */
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

/*
 * State machine kept in prdev->state:
 *   IDLE      - no work scheduled or running
 *   REQUESTED - a pass has been requested (work scheduled, or re-run needed)
 *   ACTIVE    - the worker is currently making a pass over the zones
 */
enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present, however in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

/*
 * Return the pages held in sgl to the buddy allocator, and when the report
 * succeeded (reported == true) flag each page that is still a whole buddy
 * page of its original order as PageReported. Caller holds zone->lock.
 */
static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to error skip flagging */
		if (!reported)
			continue;

		/*
		 * If page was not comingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && buddy_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages, fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages, in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	long budget;
	int err = 0;

	/*
	 * Perform early check, if free area is empty there is
	 * nothing to process so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/*
	 * Limit how many calls we will be making to the page reporting
	 * device for this list. By doing this we avoid processing any
	 * given list for too long.
	 *
	 * The current value used allows us enough calls to process over a
	 * sixteenth of the current list plus one additional call to handle
	 * any pages that may have already been present from the previous
	 * list processed. This should result in us reporting all pages on
	 * an idle system in about 30 seconds.
	 *
	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
	 * should always be a power of 2.
	 */
	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/*
		 * If we fully consumed our budget then update our
		 * state to indicate that we are requesting additional
		 * processing and exit this list.
		 */
		if (budget < 0) {
			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
			next = page;
			break;
		}

		/* Attempt to pull page from list and place in scatterlist */
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-reported page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (!list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* update budget to reflect call to report function */
		budget--;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to first entry, the old next isn't valid
		 * since we dropped the lock to report the pages
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}

/*
 * Run the reporting cycle over every reportable free list of a zone
 * (orders >= PAGE_REPORTING_MIN_ORDER, all non-isolate migratetypes),
 * then report and drain any pages still left in the scatterlist.
 * Returns 0 on success or the first error from prdev->report().
 */
static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);

	/*
	 * Cancel request if insufficient free memory or if we failed
	 * to allocate page reporting statistics for the zone.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

/*
 * Delayed-work handler: make one reporting pass over all zones, then either
 * go idle or reschedule ourselves if more reporting was requested meanwhile.
 */
static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track if there is
	 * anyone requests page reporting after we complete our pass. If
	 * the state is not altered by the end of the pass we will switch
	 * to idle and quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* Serializes register/unregister against each other */
static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

/*
 * Register prdev as the (single) free page reporting device and kick off an
 * initial flush of all zones. Returns -EBUSY if a device is already
 * registered, 0 on success.
 */
int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

/*
 * Unregister prdev: clear the RCU pointer so no new notifications can see
 * it, wait for in-flight RCU readers, then flush any pending/running work.
 */
void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);