1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/mm.h> 3 #include <linux/mmzone.h> 4 #include <linux/page_reporting.h> 5 #include <linux/gfp.h> 6 #include <linux/export.h> 7 #include <linux/delay.h> 8 #include <linux/scatterlist.h> 9 10 #include "page_reporting.h" 11 #include "internal.h" 12 13 #define PAGE_REPORTING_DELAY (2 * HZ) 14 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; 15 16 enum { 17 PAGE_REPORTING_IDLE = 0, 18 PAGE_REPORTING_REQUESTED, 19 PAGE_REPORTING_ACTIVE 20 }; 21 22 /* request page reporting */ 23 static void 24 __page_reporting_request(struct page_reporting_dev_info *prdev) 25 { 26 unsigned int state; 27 28 /* Check to see if we are in desired state */ 29 state = atomic_read(&prdev->state); 30 if (state == PAGE_REPORTING_REQUESTED) 31 return; 32 33 /* 34 * If reporting is already active there is nothing we need to do. 35 * Test against 0 as that represents PAGE_REPORTING_IDLE. 36 */ 37 state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); 38 if (state != PAGE_REPORTING_IDLE) 39 return; 40 41 /* 42 * Delay the start of work to allow a sizable queue to build. For 43 * now we are limiting this to running no more than once every 44 * couple of seconds. 45 */ 46 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); 47 } 48 49 /* notify prdev of free page reporting request */ 50 void __page_reporting_notify(void) 51 { 52 struct page_reporting_dev_info *prdev; 53 54 /* 55 * We use RCU to protect the pr_dev_info pointer. In almost all 56 * cases this should be present, however in the unlikely case of 57 * a shutdown this will be NULL and we should exit. 58 */ 59 rcu_read_lock(); 60 prdev = rcu_dereference(pr_dev_info); 61 if (likely(prdev)) 62 __page_reporting_request(prdev); 63 64 rcu_read_unlock(); 65 } 66 67 static void 68 page_reporting_drain(struct page_reporting_dev_info *prdev, 69 struct scatterlist *sgl, unsigned int nents, bool reported) 70 { 71 struct scatterlist *sg = sgl; 72 73 /* 74 * Drain the now reported pages back into their respective 75 * free lists/areas. We assume at least one page is populated. 76 */ 77 do { 78 struct page *page = sg_page(sg); 79 int mt = get_pageblock_migratetype(page); 80 unsigned int order = get_order(sg->length); 81 82 __putback_isolated_page(page, order, mt); 83 84 /* If the pages were not reported due to error skip flagging */ 85 if (!reported) 86 continue; 87 88 /* 89 * If page was not comingled with another page we can 90 * consider the result to be "reported" since the page 91 * hasn't been modified, otherwise we will need to 92 * report on the new larger page when we make our way 93 * up to that higher order. 94 */ 95 if (PageBuddy(page) && page_order(page) == order) 96 __SetPageReported(page); 97 } while ((sg = sg_next(sg))); 98 99 /* reinitialize scatterlist now that it is empty */ 100 sg_init_table(sgl, nents); 101 } 102 103 /* 104 * The page reporting cycle consists of 4 stages, fill, report, drain, and 105 * idle. We will cycle through the first 3 stages until we cannot obtain a 106 * full scatterlist of pages, in that case we will switch to idle. 107 */ 108 static int 109 page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, 110 unsigned int order, unsigned int mt, 111 struct scatterlist *sgl, unsigned int *offset) 112 { 113 struct free_area *area = &zone->free_area[order]; 114 struct list_head *list = &area->free_list[mt]; 115 unsigned int page_len = PAGE_SIZE << order; 116 struct page *page, *next; 117 long budget; 118 int err = 0; 119 120 /* 121 * Perform early check, if free area is empty there is 122 * nothing to process so we can skip this free_list. 123 */ 124 if (list_empty(list)) 125 return err; 126 127 spin_lock_irq(&zone->lock); 128 129 /* 130 * Limit how many calls we will be making to the page reporting 131 * device for this list. By doing this we avoid processing any 132 * given list for too long. 133 * 134 * The current value used allows us enough calls to process over a 135 * sixteenth of the current list plus one additional call to handle 136 * any pages that may have already been present from the previous 137 * list processed. This should result in us reporting all pages on 138 * an idle system in about 30 seconds. 139 * 140 * The division here should be cheap since PAGE_REPORTING_CAPACITY 141 * should always be a power of 2. 142 */ 143 budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); 144 145 /* loop through free list adding unreported pages to sg list */ 146 list_for_each_entry_safe(page, next, list, lru) { 147 /* We are going to skip over the reported pages. */ 148 if (PageReported(page)) 149 continue; 150 151 /* 152 * If we fully consumed our budget then update our 153 * state to indicate that we are requesting additional 154 * processing and exit this list. 155 */ 156 if (budget < 0) { 157 atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); 158 next = page; 159 break; 160 } 161 162 /* Attempt to pull page from list and place in scatterlist */ 163 if (*offset) { 164 if (!__isolate_free_page(page, order)) { 165 next = page; 166 break; 167 } 168 169 /* Add page to scatter list */ 170 --(*offset); 171 sg_set_page(&sgl[*offset], page, page_len, 0); 172 173 continue; 174 } 175 176 /* 177 * Make the first non-reported page in the free list 178 * the new head of the free list before we release the 179 * zone lock. 180 */ 181 if (&page->lru != list && !list_is_first(&page->lru, list)) 182 list_rotate_to_front(&page->lru, list); 183 184 /* release lock before waiting on report processing */ 185 spin_unlock_irq(&zone->lock); 186 187 /* begin processing pages in local list */ 188 err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); 189 190 /* reset offset since the full list was reported */ 191 *offset = PAGE_REPORTING_CAPACITY; 192 193 /* update budget to reflect call to report function */ 194 budget--; 195 196 /* reacquire zone lock and resume processing */ 197 spin_lock_irq(&zone->lock); 198 199 /* flush reported pages from the sg list */ 200 page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); 201 202 /* 203 * Reset next to first entry, the old next isn't valid 204 * since we dropped the lock to report the pages 205 */ 206 next = list_first_entry(list, struct page, lru); 207 208 /* exit on error */ 209 if (err) 210 break; 211 } 212 213 /* Rotate any leftover pages to the head of the freelist */ 214 if (&next->lru != list && !list_is_first(&next->lru, list)) 215 list_rotate_to_front(&next->lru, list); 216 217 spin_unlock_irq(&zone->lock); 218 219 return err; 220 } 221 222 static int 223 page_reporting_process_zone(struct page_reporting_dev_info *prdev, 224 struct scatterlist *sgl, struct zone *zone) 225 { 226 unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; 227 unsigned long watermark; 228 int err = 0; 229 230 /* Generate minimum watermark to be able to guarantee progress */ 231 watermark = low_wmark_pages(zone) + 232 (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER); 233 234 /* 235 * Cancel request if insufficient free memory or if we failed 236 * to allocate page reporting statistics for the zone. 237 */ 238 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 239 return err; 240 241 /* Process each free list starting from lowest order/mt */ 242 for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) { 243 for (mt = 0; mt < MIGRATE_TYPES; mt++) { 244 /* We do not pull pages from the isolate free list */ 245 if (is_migrate_isolate(mt)) 246 continue; 247 248 err = page_reporting_cycle(prdev, zone, order, mt, 249 sgl, &offset); 250 if (err) 251 return err; 252 } 253 } 254 255 /* report the leftover pages before going idle */ 256 leftover = PAGE_REPORTING_CAPACITY - offset; 257 if (leftover) { 258 sgl = &sgl[offset]; 259 err = prdev->report(prdev, sgl, leftover); 260 261 /* flush any remaining pages out from the last report */ 262 spin_lock_irq(&zone->lock); 263 page_reporting_drain(prdev, sgl, leftover, !err); 264 spin_unlock_irq(&zone->lock); 265 } 266 267 return err; 268 } 269 270 static void page_reporting_process(struct work_struct *work) 271 { 272 struct delayed_work *d_work = to_delayed_work(work); 273 struct page_reporting_dev_info *prdev = 274 container_of(d_work, struct page_reporting_dev_info, work); 275 int err = 0, state = PAGE_REPORTING_ACTIVE; 276 struct scatterlist *sgl; 277 struct zone *zone; 278 279 /* 280 * Change the state to "Active" so that we can track if there is 281 * anyone requests page reporting after we complete our pass. If 282 * the state is not altered by the end of the pass we will switch 283 * to idle and quit scheduling reporting runs. 284 */ 285 atomic_set(&prdev->state, state); 286 287 /* allocate scatterlist to store pages being reported on */ 288 sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); 289 if (!sgl) 290 goto err_out; 291 292 sg_init_table(sgl, PAGE_REPORTING_CAPACITY); 293 294 for_each_zone(zone) { 295 err = page_reporting_process_zone(prdev, sgl, zone); 296 if (err) 297 break; 298 } 299 300 kfree(sgl); 301 err_out: 302 /* 303 * If the state has reverted back to requested then there may be 304 * additional pages to be processed. We will defer for 2s to allow 305 * more pages to accumulate. 306 */ 307 state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); 308 if (state == PAGE_REPORTING_REQUESTED) 309 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); 310 } 311 312 static DEFINE_MUTEX(page_reporting_mutex); 313 DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); 314 315 int page_reporting_register(struct page_reporting_dev_info *prdev) 316 { 317 int err = 0; 318 319 mutex_lock(&page_reporting_mutex); 320 321 /* nothing to do if already in use */ 322 if (rcu_access_pointer(pr_dev_info)) { 323 err = -EBUSY; 324 goto err_out; 325 } 326 327 /* initialize state and work structures */ 328 atomic_set(&prdev->state, PAGE_REPORTING_IDLE); 329 INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); 330 331 /* Begin initial flush of zones */ 332 __page_reporting_request(prdev); 333 334 /* Assign device to allow notifications */ 335 rcu_assign_pointer(pr_dev_info, prdev); 336 337 /* enable page reporting notification */ 338 if (!static_key_enabled(&page_reporting_enabled)) { 339 static_branch_enable(&page_reporting_enabled); 340 pr_info("Free page reporting enabled\n"); 341 } 342 err_out: 343 mutex_unlock(&page_reporting_mutex); 344 345 return err; 346 } 347 EXPORT_SYMBOL_GPL(page_reporting_register); 348 349 void page_reporting_unregister(struct page_reporting_dev_info *prdev) 350 { 351 mutex_lock(&page_reporting_mutex); 352 353 if (rcu_access_pointer(pr_dev_info) == prdev) { 354 /* Disable page reporting notification */ 355 RCU_INIT_POINTER(pr_dev_info, NULL); 356 synchronize_rcu(); 357 358 /* Flush any existing work, and lock it out */ 359 cancel_delayed_work_sync(&prdev->work); 360 } 361 362 mutex_unlock(&page_reporting_mutex); 363 } 364 EXPORT_SYMBOL_GPL(page_reporting_unregister); 365