// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present, however in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to error skip flagging */
		if (!reported)
			continue;

		/*
		 * If page was not commingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && page_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages, at which point we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	int err = 0;

	/*
	 * Perform early check, if free area is empty there is
	 * nothing to process so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/* Attempt to pull page from list and place in scatterlist */
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-processed page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (&page->lru != list && !list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to the first entry; the old next isn't valid
		 * since we dropped the lock to report the pages.
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (&next->lru != list && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}

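/*
 * Illustrative sketch, not part of this file: roughly what a backend's
 * prdev->report() callback might look like. The core hands the callback a
 * scatterlist of pages that page_reporting_cycle() above isolated from the
 * free lists; the callback hints them to its hypervisor/backend and returns
 * 0 on success or a negative errno, after which page_reporting_drain()
 * puts the pages back. example_hint_range() is a hypothetical helper
 * standing in for a real device operation.
 *
 *	static int example_report(struct page_reporting_dev_info *prdev,
 *				  struct scatterlist *sgl, unsigned int nents)
 *	{
 *		struct scatterlist *sg;
 *		int i;
 *
 *		for_each_sg(sgl, sg, nents, i) {
 *			int err = example_hint_range(sg_phys(sg), sg->length);
 *
 *			if (err)
 *				return err;
 *		}
 *
 *		return 0;
 *	}
 */
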
static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);

	/*
	 * Cancel request if insufficient free memory or if we failed
	 * to allocate page reporting statistics for the zone.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If
	 * the state is not altered by the end of the pass we will switch
	 * to idle and quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_access_pointer(pr_dev_info)) {
		err = -EBUSY;
		goto err_out;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);
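
/*
 * Illustrative sketch, not part of this file: registration as a backend
 * driver might do it. The driver embeds a page_reporting_dev_info in its
 * private state, fills in the ->report callback, and registers it. Only
 * one reporting device may be registered at a time; a second caller gets
 * -EBUSY until the first unregisters. example_report() is the hypothetical
 * callback sketched earlier.
 *
 *	prdev->report = example_report;
 *	err = page_reporting_register(prdev);
 *	if (err)
 *		return err;
 */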

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (rcu_access_pointer(pr_dev_info) == prdev) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);
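
/*
 * Illustrative teardown sketch, not part of this file: drivers pair the
 * registration above with an unregister call in their remove path:
 *
 *	page_reporting_unregister(prdev);
 *
 * Once it returns, no ->report callback is in flight: synchronize_rcu()
 * guarantees __page_reporting_notify() can no longer see the device, and
 * cancel_delayed_work_sync() has flushed the worker. Note that the
 * page_reporting_enabled static key is never disabled, even after the
 * device unregisters.
 */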