1*1da177e4SLinus Torvalds /* 2*1da177e4SLinus Torvalds * linux/mm/vmscan.c 3*1da177e4SLinus Torvalds * 4*1da177e4SLinus Torvalds * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5*1da177e4SLinus Torvalds * 6*1da177e4SLinus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie. 7*1da177e4SLinus Torvalds * kswapd added: 7.1.96 sct 8*1da177e4SLinus Torvalds * Removed kswapd_ctl limits, and swap out as many pages as needed 9*1da177e4SLinus Torvalds * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 10*1da177e4SLinus Torvalds * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 11*1da177e4SLinus Torvalds * Multiqueue VM started 5.8.00, Rik van Riel. 12*1da177e4SLinus Torvalds */ 13*1da177e4SLinus Torvalds 14*1da177e4SLinus Torvalds #include <linux/mm.h> 15*1da177e4SLinus Torvalds #include <linux/module.h> 16*1da177e4SLinus Torvalds #include <linux/slab.h> 17*1da177e4SLinus Torvalds #include <linux/kernel_stat.h> 18*1da177e4SLinus Torvalds #include <linux/swap.h> 19*1da177e4SLinus Torvalds #include <linux/pagemap.h> 20*1da177e4SLinus Torvalds #include <linux/init.h> 21*1da177e4SLinus Torvalds #include <linux/highmem.h> 22*1da177e4SLinus Torvalds #include <linux/file.h> 23*1da177e4SLinus Torvalds #include <linux/writeback.h> 24*1da177e4SLinus Torvalds #include <linux/blkdev.h> 25*1da177e4SLinus Torvalds #include <linux/buffer_head.h> /* for try_to_release_page(), 26*1da177e4SLinus Torvalds buffer_heads_over_limit */ 27*1da177e4SLinus Torvalds #include <linux/mm_inline.h> 28*1da177e4SLinus Torvalds #include <linux/pagevec.h> 29*1da177e4SLinus Torvalds #include <linux/backing-dev.h> 30*1da177e4SLinus Torvalds #include <linux/rmap.h> 31*1da177e4SLinus Torvalds #include <linux/topology.h> 32*1da177e4SLinus Torvalds #include <linux/cpu.h> 33*1da177e4SLinus Torvalds #include <linux/cpuset.h> 34*1da177e4SLinus Torvalds #include <linux/notifier.h> 35*1da177e4SLinus Torvalds #include <linux/rwsem.h> 36*1da177e4SLinus Torvalds 37*1da177e4SLinus Torvalds #include <asm/tlbflush.h> 38*1da177e4SLinus Torvalds #include <asm/div64.h> 39*1da177e4SLinus Torvalds 40*1da177e4SLinus Torvalds #include <linux/swapops.h> 41*1da177e4SLinus Torvalds 42*1da177e4SLinus Torvalds /* possible outcome of pageout() */ 43*1da177e4SLinus Torvalds typedef enum { 44*1da177e4SLinus Torvalds /* failed to write page out, page is locked */ 45*1da177e4SLinus Torvalds PAGE_KEEP, 46*1da177e4SLinus Torvalds /* move page to the active list, page is locked */ 47*1da177e4SLinus Torvalds PAGE_ACTIVATE, 48*1da177e4SLinus Torvalds /* page has been sent to the disk successfully, page is unlocked */ 49*1da177e4SLinus Torvalds PAGE_SUCCESS, 50*1da177e4SLinus Torvalds /* page is clean and locked */ 51*1da177e4SLinus Torvalds PAGE_CLEAN, 52*1da177e4SLinus Torvalds } pageout_t; 53*1da177e4SLinus Torvalds 54*1da177e4SLinus Torvalds struct scan_control { 55*1da177e4SLinus Torvalds /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ 56*1da177e4SLinus Torvalds unsigned long nr_to_scan; 57*1da177e4SLinus Torvalds 58*1da177e4SLinus Torvalds /* Incremented by the number of inactive pages that were scanned */ 59*1da177e4SLinus Torvalds unsigned long nr_scanned; 60*1da177e4SLinus Torvalds 61*1da177e4SLinus Torvalds /* Incremented by the number of pages reclaimed */ 62*1da177e4SLinus Torvalds unsigned long nr_reclaimed; 63*1da177e4SLinus Torvalds 64*1da177e4SLinus Torvalds unsigned long nr_mapped; /* From page_state */ 65*1da177e4SLinus Torvalds 66*1da177e4SLinus Torvalds /* How many pages shrink_cache() should reclaim */ 67*1da177e4SLinus Torvalds int nr_to_reclaim; 68*1da177e4SLinus Torvalds 69*1da177e4SLinus Torvalds /* Ask shrink_caches, or shrink_zone to scan at this priority */ 70*1da177e4SLinus Torvalds unsigned int priority; 71*1da177e4SLinus Torvalds 72*1da177e4SLinus Torvalds /* This context's GFP mask */ 73*1da177e4SLinus Torvalds unsigned int gfp_mask; 74*1da177e4SLinus Torvalds 75*1da177e4SLinus Torvalds int may_writepage; 76*1da177e4SLinus Torvalds 77*1da177e4SLinus Torvalds /* This context's SWAP_CLUSTER_MAX. If freeing memory for 78*1da177e4SLinus Torvalds * suspend, we effectively ignore SWAP_CLUSTER_MAX. 79*1da177e4SLinus Torvalds * In this context, it doesn't matter that we scan the 80*1da177e4SLinus Torvalds * whole list at once. */ 81*1da177e4SLinus Torvalds int swap_cluster_max; 82*1da177e4SLinus Torvalds }; 83*1da177e4SLinus Torvalds 84*1da177e4SLinus Torvalds /* 85*1da177e4SLinus Torvalds * The list of shrinker callbacks used by to apply pressure to 86*1da177e4SLinus Torvalds * ageable caches. 87*1da177e4SLinus Torvalds */ 88*1da177e4SLinus Torvalds struct shrinker { 89*1da177e4SLinus Torvalds shrinker_t shrinker; 90*1da177e4SLinus Torvalds struct list_head list; 91*1da177e4SLinus Torvalds int seeks; /* seeks to recreate an obj */ 92*1da177e4SLinus Torvalds long nr; /* objs pending delete */ 93*1da177e4SLinus Torvalds }; 94*1da177e4SLinus Torvalds 95*1da177e4SLinus Torvalds #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 96*1da177e4SLinus Torvalds 97*1da177e4SLinus Torvalds #ifdef ARCH_HAS_PREFETCH 98*1da177e4SLinus Torvalds #define prefetch_prev_lru_page(_page, _base, _field) \ 99*1da177e4SLinus Torvalds do { \ 100*1da177e4SLinus Torvalds if ((_page)->lru.prev != _base) { \ 101*1da177e4SLinus Torvalds struct page *prev; \ 102*1da177e4SLinus Torvalds \ 103*1da177e4SLinus Torvalds prev = lru_to_page(&(_page->lru)); \ 104*1da177e4SLinus Torvalds prefetch(&prev->_field); \ 105*1da177e4SLinus Torvalds } \ 106*1da177e4SLinus Torvalds } while (0) 107*1da177e4SLinus Torvalds #else 108*1da177e4SLinus Torvalds #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 109*1da177e4SLinus Torvalds #endif 110*1da177e4SLinus Torvalds 111*1da177e4SLinus Torvalds #ifdef ARCH_HAS_PREFETCHW 112*1da177e4SLinus Torvalds #define prefetchw_prev_lru_page(_page, _base, _field) \ 113*1da177e4SLinus Torvalds do { \ 114*1da177e4SLinus Torvalds if ((_page)->lru.prev != _base) { \ 115*1da177e4SLinus Torvalds struct page *prev; \ 116*1da177e4SLinus Torvalds \ 117*1da177e4SLinus Torvalds prev = lru_to_page(&(_page->lru)); \ 118*1da177e4SLinus Torvalds prefetchw(&prev->_field); \ 119*1da177e4SLinus Torvalds } \ 120*1da177e4SLinus Torvalds } while (0) 121*1da177e4SLinus Torvalds #else 122*1da177e4SLinus Torvalds #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 123*1da177e4SLinus Torvalds #endif 124*1da177e4SLinus Torvalds 125*1da177e4SLinus Torvalds /* 126*1da177e4SLinus Torvalds * From 0 .. 100. Higher means more swappy. 127*1da177e4SLinus Torvalds */ 128*1da177e4SLinus Torvalds int vm_swappiness = 60; 129*1da177e4SLinus Torvalds static long total_memory; 130*1da177e4SLinus Torvalds 131*1da177e4SLinus Torvalds static LIST_HEAD(shrinker_list); 132*1da177e4SLinus Torvalds static DECLARE_RWSEM(shrinker_rwsem); 133*1da177e4SLinus Torvalds 134*1da177e4SLinus Torvalds /* 135*1da177e4SLinus Torvalds * Add a shrinker callback to be called from the vm 136*1da177e4SLinus Torvalds */ 137*1da177e4SLinus Torvalds struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) 138*1da177e4SLinus Torvalds { 139*1da177e4SLinus Torvalds struct shrinker *shrinker; 140*1da177e4SLinus Torvalds 141*1da177e4SLinus Torvalds shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); 142*1da177e4SLinus Torvalds if (shrinker) { 143*1da177e4SLinus Torvalds shrinker->shrinker = theshrinker; 144*1da177e4SLinus Torvalds shrinker->seeks = seeks; 145*1da177e4SLinus Torvalds shrinker->nr = 0; 146*1da177e4SLinus Torvalds down_write(&shrinker_rwsem); 147*1da177e4SLinus Torvalds list_add_tail(&shrinker->list, &shrinker_list); 148*1da177e4SLinus Torvalds up_write(&shrinker_rwsem); 149*1da177e4SLinus Torvalds } 150*1da177e4SLinus Torvalds return shrinker; 151*1da177e4SLinus Torvalds } 152*1da177e4SLinus Torvalds EXPORT_SYMBOL(set_shrinker); 153*1da177e4SLinus Torvalds 154*1da177e4SLinus Torvalds /* 155*1da177e4SLinus Torvalds * Remove one 156*1da177e4SLinus Torvalds */ 157*1da177e4SLinus Torvalds void remove_shrinker(struct shrinker *shrinker) 158*1da177e4SLinus Torvalds { 159*1da177e4SLinus Torvalds down_write(&shrinker_rwsem); 160*1da177e4SLinus Torvalds list_del(&shrinker->list); 161*1da177e4SLinus Torvalds up_write(&shrinker_rwsem); 162*1da177e4SLinus Torvalds kfree(shrinker); 163*1da177e4SLinus Torvalds } 164*1da177e4SLinus Torvalds EXPORT_SYMBOL(remove_shrinker); 165*1da177e4SLinus Torvalds 166*1da177e4SLinus Torvalds #define SHRINK_BATCH 128 167*1da177e4SLinus Torvalds /* 168*1da177e4SLinus Torvalds * Call the shrink functions to age shrinkable caches 169*1da177e4SLinus Torvalds * 170*1da177e4SLinus Torvalds * Here we assume it costs one seek to replace a lru page and that it also 171*1da177e4SLinus Torvalds * takes a seek to recreate a cache object. With this in mind we age equal 172*1da177e4SLinus Torvalds * percentages of the lru and ageable caches. This should balance the seeks 173*1da177e4SLinus Torvalds * generated by these structures. 174*1da177e4SLinus Torvalds * 175*1da177e4SLinus Torvalds * If the vm encounted mapped pages on the LRU it increase the pressure on 176*1da177e4SLinus Torvalds * slab to avoid swapping. 177*1da177e4SLinus Torvalds * 178*1da177e4SLinus Torvalds * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 179*1da177e4SLinus Torvalds * 180*1da177e4SLinus Torvalds * `lru_pages' represents the number of on-LRU pages in all the zones which 181*1da177e4SLinus Torvalds * are eligible for the caller's allocation attempt. It is used for balancing 182*1da177e4SLinus Torvalds * slab reclaim versus page reclaim. 183*1da177e4SLinus Torvalds */ 184*1da177e4SLinus Torvalds static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, 185*1da177e4SLinus Torvalds unsigned long lru_pages) 186*1da177e4SLinus Torvalds { 187*1da177e4SLinus Torvalds struct shrinker *shrinker; 188*1da177e4SLinus Torvalds 189*1da177e4SLinus Torvalds if (scanned == 0) 190*1da177e4SLinus Torvalds scanned = SWAP_CLUSTER_MAX; 191*1da177e4SLinus Torvalds 192*1da177e4SLinus Torvalds if (!down_read_trylock(&shrinker_rwsem)) 193*1da177e4SLinus Torvalds return 0; 194*1da177e4SLinus Torvalds 195*1da177e4SLinus Torvalds list_for_each_entry(shrinker, &shrinker_list, list) { 196*1da177e4SLinus Torvalds unsigned long long delta; 197*1da177e4SLinus Torvalds unsigned long total_scan; 198*1da177e4SLinus Torvalds 199*1da177e4SLinus Torvalds delta = (4 * scanned) / shrinker->seeks; 200*1da177e4SLinus Torvalds delta *= (*shrinker->shrinker)(0, gfp_mask); 201*1da177e4SLinus Torvalds do_div(delta, lru_pages + 1); 202*1da177e4SLinus Torvalds shrinker->nr += delta; 203*1da177e4SLinus Torvalds if (shrinker->nr < 0) 204*1da177e4SLinus Torvalds shrinker->nr = LONG_MAX; /* It wrapped! */ 205*1da177e4SLinus Torvalds 206*1da177e4SLinus Torvalds total_scan = shrinker->nr; 207*1da177e4SLinus Torvalds shrinker->nr = 0; 208*1da177e4SLinus Torvalds 209*1da177e4SLinus Torvalds while (total_scan >= SHRINK_BATCH) { 210*1da177e4SLinus Torvalds long this_scan = SHRINK_BATCH; 211*1da177e4SLinus Torvalds int shrink_ret; 212*1da177e4SLinus Torvalds 213*1da177e4SLinus Torvalds shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 214*1da177e4SLinus Torvalds if (shrink_ret == -1) 215*1da177e4SLinus Torvalds break; 216*1da177e4SLinus Torvalds mod_page_state(slabs_scanned, this_scan); 217*1da177e4SLinus Torvalds total_scan -= this_scan; 218*1da177e4SLinus Torvalds 219*1da177e4SLinus Torvalds cond_resched(); 220*1da177e4SLinus Torvalds } 221*1da177e4SLinus Torvalds 222*1da177e4SLinus Torvalds shrinker->nr += total_scan; 223*1da177e4SLinus Torvalds } 224*1da177e4SLinus Torvalds up_read(&shrinker_rwsem); 225*1da177e4SLinus Torvalds return 0; 226*1da177e4SLinus Torvalds } 227*1da177e4SLinus Torvalds 228*1da177e4SLinus Torvalds /* Called without lock on whether page is mapped, so answer is unstable */ 229*1da177e4SLinus Torvalds static inline int page_mapping_inuse(struct page *page) 230*1da177e4SLinus Torvalds { 231*1da177e4SLinus Torvalds struct address_space *mapping; 232*1da177e4SLinus Torvalds 233*1da177e4SLinus Torvalds /* Page is in somebody's page tables. */ 234*1da177e4SLinus Torvalds if (page_mapped(page)) 235*1da177e4SLinus Torvalds return 1; 236*1da177e4SLinus Torvalds 237*1da177e4SLinus Torvalds /* Be more reluctant to reclaim swapcache than pagecache */ 238*1da177e4SLinus Torvalds if (PageSwapCache(page)) 239*1da177e4SLinus Torvalds return 1; 240*1da177e4SLinus Torvalds 241*1da177e4SLinus Torvalds mapping = page_mapping(page); 242*1da177e4SLinus Torvalds if (!mapping) 243*1da177e4SLinus Torvalds return 0; 244*1da177e4SLinus Torvalds 245*1da177e4SLinus Torvalds /* File is mmap'd by somebody? */ 246*1da177e4SLinus Torvalds return mapping_mapped(mapping); 247*1da177e4SLinus Torvalds } 248*1da177e4SLinus Torvalds 249*1da177e4SLinus Torvalds static inline int is_page_cache_freeable(struct page *page) 250*1da177e4SLinus Torvalds { 251*1da177e4SLinus Torvalds return page_count(page) - !!PagePrivate(page) == 2; 252*1da177e4SLinus Torvalds } 253*1da177e4SLinus Torvalds 254*1da177e4SLinus Torvalds static int may_write_to_queue(struct backing_dev_info *bdi) 255*1da177e4SLinus Torvalds { 256*1da177e4SLinus Torvalds if (current_is_kswapd()) 257*1da177e4SLinus Torvalds return 1; 258*1da177e4SLinus Torvalds if (current_is_pdflush()) /* This is unlikely, but why not... */ 259*1da177e4SLinus Torvalds return 1; 260*1da177e4SLinus Torvalds if (!bdi_write_congested(bdi)) 261*1da177e4SLinus Torvalds return 1; 262*1da177e4SLinus Torvalds if (bdi == current->backing_dev_info) 263*1da177e4SLinus Torvalds return 1; 264*1da177e4SLinus Torvalds return 0; 265*1da177e4SLinus Torvalds } 266*1da177e4SLinus Torvalds 267*1da177e4SLinus Torvalds /* 268*1da177e4SLinus Torvalds * We detected a synchronous write error writing a page out. Probably 269*1da177e4SLinus Torvalds * -ENOSPC. We need to propagate that into the address_space for a subsequent 270*1da177e4SLinus Torvalds * fsync(), msync() or close(). 271*1da177e4SLinus Torvalds * 272*1da177e4SLinus Torvalds * The tricky part is that after writepage we cannot touch the mapping: nothing 273*1da177e4SLinus Torvalds * prevents it from being freed up. But we have a ref on the page and once 274*1da177e4SLinus Torvalds * that page is locked, the mapping is pinned. 275*1da177e4SLinus Torvalds * 276*1da177e4SLinus Torvalds * We're allowed to run sleeping lock_page() here because we know the caller has 277*1da177e4SLinus Torvalds * __GFP_FS. 278*1da177e4SLinus Torvalds */ 279*1da177e4SLinus Torvalds static void handle_write_error(struct address_space *mapping, 280*1da177e4SLinus Torvalds struct page *page, int error) 281*1da177e4SLinus Torvalds { 282*1da177e4SLinus Torvalds lock_page(page); 283*1da177e4SLinus Torvalds if (page_mapping(page) == mapping) { 284*1da177e4SLinus Torvalds if (error == -ENOSPC) 285*1da177e4SLinus Torvalds set_bit(AS_ENOSPC, &mapping->flags); 286*1da177e4SLinus Torvalds else 287*1da177e4SLinus Torvalds set_bit(AS_EIO, &mapping->flags); 288*1da177e4SLinus Torvalds } 289*1da177e4SLinus Torvalds unlock_page(page); 290*1da177e4SLinus Torvalds } 291*1da177e4SLinus Torvalds 292*1da177e4SLinus Torvalds /* 293*1da177e4SLinus Torvalds * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 294*1da177e4SLinus Torvalds */ 295*1da177e4SLinus Torvalds static pageout_t pageout(struct page *page, struct address_space *mapping) 296*1da177e4SLinus Torvalds { 297*1da177e4SLinus Torvalds /* 298*1da177e4SLinus Torvalds * If the page is dirty, only perform writeback if that write 299*1da177e4SLinus Torvalds * will be non-blocking. To prevent this allocation from being 300*1da177e4SLinus Torvalds * stalled by pagecache activity. But note that there may be 301*1da177e4SLinus Torvalds * stalls if we need to run get_block(). We could test 302*1da177e4SLinus Torvalds * PagePrivate for that. 303*1da177e4SLinus Torvalds * 304*1da177e4SLinus Torvalds * If this process is currently in generic_file_write() against 305*1da177e4SLinus Torvalds * this page's queue, we can perform writeback even if that 306*1da177e4SLinus Torvalds * will block. 307*1da177e4SLinus Torvalds * 308*1da177e4SLinus Torvalds * If the page is swapcache, write it back even if that would 309*1da177e4SLinus Torvalds * block, for some throttling. This happens by accident, because 310*1da177e4SLinus Torvalds * swap_backing_dev_info is bust: it doesn't reflect the 311*1da177e4SLinus Torvalds * congestion state of the swapdevs. Easy to fix, if needed. 312*1da177e4SLinus Torvalds * See swapfile.c:page_queue_congested(). 313*1da177e4SLinus Torvalds */ 314*1da177e4SLinus Torvalds if (!is_page_cache_freeable(page)) 315*1da177e4SLinus Torvalds return PAGE_KEEP; 316*1da177e4SLinus Torvalds if (!mapping) { 317*1da177e4SLinus Torvalds /* 318*1da177e4SLinus Torvalds * Some data journaling orphaned pages can have 319*1da177e4SLinus Torvalds * page->mapping == NULL while being dirty with clean buffers. 320*1da177e4SLinus Torvalds */ 321*1da177e4SLinus Torvalds if (PageDirty(page) && PagePrivate(page)) { 322*1da177e4SLinus Torvalds if (try_to_free_buffers(page)) { 323*1da177e4SLinus Torvalds ClearPageDirty(page); 324*1da177e4SLinus Torvalds printk("%s: orphaned page\n", __FUNCTION__); 325*1da177e4SLinus Torvalds return PAGE_CLEAN; 326*1da177e4SLinus Torvalds } 327*1da177e4SLinus Torvalds } 328*1da177e4SLinus Torvalds return PAGE_KEEP; 329*1da177e4SLinus Torvalds } 330*1da177e4SLinus Torvalds if (mapping->a_ops->writepage == NULL) 331*1da177e4SLinus Torvalds return PAGE_ACTIVATE; 332*1da177e4SLinus Torvalds if (!may_write_to_queue(mapping->backing_dev_info)) 333*1da177e4SLinus Torvalds return PAGE_KEEP; 334*1da177e4SLinus Torvalds 335*1da177e4SLinus Torvalds if (clear_page_dirty_for_io(page)) { 336*1da177e4SLinus Torvalds int res; 337*1da177e4SLinus Torvalds struct writeback_control wbc = { 338*1da177e4SLinus Torvalds .sync_mode = WB_SYNC_NONE, 339*1da177e4SLinus Torvalds .nr_to_write = SWAP_CLUSTER_MAX, 340*1da177e4SLinus Torvalds .nonblocking = 1, 341*1da177e4SLinus Torvalds .for_reclaim = 1, 342*1da177e4SLinus Torvalds }; 343*1da177e4SLinus Torvalds 344*1da177e4SLinus Torvalds SetPageReclaim(page); 345*1da177e4SLinus Torvalds res = mapping->a_ops->writepage(page, &wbc); 346*1da177e4SLinus Torvalds if (res < 0) 347*1da177e4SLinus Torvalds handle_write_error(mapping, page, res); 348*1da177e4SLinus Torvalds if (res == WRITEPAGE_ACTIVATE) { 349*1da177e4SLinus Torvalds ClearPageReclaim(page); 350*1da177e4SLinus Torvalds return PAGE_ACTIVATE; 351*1da177e4SLinus Torvalds } 352*1da177e4SLinus Torvalds if (!PageWriteback(page)) { 353*1da177e4SLinus Torvalds /* synchronous write or broken a_ops? */ 354*1da177e4SLinus Torvalds ClearPageReclaim(page); 355*1da177e4SLinus Torvalds } 356*1da177e4SLinus Torvalds 357*1da177e4SLinus Torvalds return PAGE_SUCCESS; 358*1da177e4SLinus Torvalds } 359*1da177e4SLinus Torvalds 360*1da177e4SLinus Torvalds return PAGE_CLEAN; 361*1da177e4SLinus Torvalds } 362*1da177e4SLinus Torvalds 363*1da177e4SLinus Torvalds /* 364*1da177e4SLinus Torvalds * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 365*1da177e4SLinus Torvalds */ 366*1da177e4SLinus Torvalds static int shrink_list(struct list_head *page_list, struct scan_control *sc) 367*1da177e4SLinus Torvalds { 368*1da177e4SLinus Torvalds LIST_HEAD(ret_pages); 369*1da177e4SLinus Torvalds struct pagevec freed_pvec; 370*1da177e4SLinus Torvalds int pgactivate = 0; 371*1da177e4SLinus Torvalds int reclaimed = 0; 372*1da177e4SLinus Torvalds 373*1da177e4SLinus Torvalds cond_resched(); 374*1da177e4SLinus Torvalds 375*1da177e4SLinus Torvalds pagevec_init(&freed_pvec, 1); 376*1da177e4SLinus Torvalds while (!list_empty(page_list)) { 377*1da177e4SLinus Torvalds struct address_space *mapping; 378*1da177e4SLinus Torvalds struct page *page; 379*1da177e4SLinus Torvalds int may_enter_fs; 380*1da177e4SLinus Torvalds int referenced; 381*1da177e4SLinus Torvalds 382*1da177e4SLinus Torvalds cond_resched(); 383*1da177e4SLinus Torvalds 384*1da177e4SLinus Torvalds page = lru_to_page(page_list); 385*1da177e4SLinus Torvalds list_del(&page->lru); 386*1da177e4SLinus Torvalds 387*1da177e4SLinus Torvalds if (TestSetPageLocked(page)) 388*1da177e4SLinus Torvalds goto keep; 389*1da177e4SLinus Torvalds 390*1da177e4SLinus Torvalds BUG_ON(PageActive(page)); 391*1da177e4SLinus Torvalds 392*1da177e4SLinus Torvalds sc->nr_scanned++; 393*1da177e4SLinus Torvalds /* Double the slab pressure for mapped and swapcache pages */ 394*1da177e4SLinus Torvalds if (page_mapped(page) || PageSwapCache(page)) 395*1da177e4SLinus Torvalds sc->nr_scanned++; 396*1da177e4SLinus Torvalds 397*1da177e4SLinus Torvalds if (PageWriteback(page)) 398*1da177e4SLinus Torvalds goto keep_locked; 399*1da177e4SLinus Torvalds 400*1da177e4SLinus Torvalds referenced = page_referenced(page, 1, sc->priority <= 0); 401*1da177e4SLinus Torvalds /* In active use or really unfreeable? Activate it. */ 402*1da177e4SLinus Torvalds if (referenced && page_mapping_inuse(page)) 403*1da177e4SLinus Torvalds goto activate_locked; 404*1da177e4SLinus Torvalds 405*1da177e4SLinus Torvalds #ifdef CONFIG_SWAP 406*1da177e4SLinus Torvalds /* 407*1da177e4SLinus Torvalds * Anonymous process memory has backing store? 408*1da177e4SLinus Torvalds * Try to allocate it some swap space here. 409*1da177e4SLinus Torvalds */ 410*1da177e4SLinus Torvalds if (PageAnon(page) && !PageSwapCache(page)) { 411*1da177e4SLinus Torvalds if (!add_to_swap(page)) 412*1da177e4SLinus Torvalds goto activate_locked; 413*1da177e4SLinus Torvalds } 414*1da177e4SLinus Torvalds #endif /* CONFIG_SWAP */ 415*1da177e4SLinus Torvalds 416*1da177e4SLinus Torvalds mapping = page_mapping(page); 417*1da177e4SLinus Torvalds may_enter_fs = (sc->gfp_mask & __GFP_FS) || 418*1da177e4SLinus Torvalds (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 419*1da177e4SLinus Torvalds 420*1da177e4SLinus Torvalds /* 421*1da177e4SLinus Torvalds * The page is mapped into the page tables of one or more 422*1da177e4SLinus Torvalds * processes. Try to unmap it here. 423*1da177e4SLinus Torvalds */ 424*1da177e4SLinus Torvalds if (page_mapped(page) && mapping) { 425*1da177e4SLinus Torvalds switch (try_to_unmap(page)) { 426*1da177e4SLinus Torvalds case SWAP_FAIL: 427*1da177e4SLinus Torvalds goto activate_locked; 428*1da177e4SLinus Torvalds case SWAP_AGAIN: 429*1da177e4SLinus Torvalds goto keep_locked; 430*1da177e4SLinus Torvalds case SWAP_SUCCESS: 431*1da177e4SLinus Torvalds ; /* try to free the page below */ 432*1da177e4SLinus Torvalds } 433*1da177e4SLinus Torvalds } 434*1da177e4SLinus Torvalds 435*1da177e4SLinus Torvalds if (PageDirty(page)) { 436*1da177e4SLinus Torvalds if (referenced) 437*1da177e4SLinus Torvalds goto keep_locked; 438*1da177e4SLinus Torvalds if (!may_enter_fs) 439*1da177e4SLinus Torvalds goto keep_locked; 440*1da177e4SLinus Torvalds if (laptop_mode && !sc->may_writepage) 441*1da177e4SLinus Torvalds goto keep_locked; 442*1da177e4SLinus Torvalds 443*1da177e4SLinus Torvalds /* Page is dirty, try to write it out here */ 444*1da177e4SLinus Torvalds switch(pageout(page, mapping)) { 445*1da177e4SLinus Torvalds case PAGE_KEEP: 446*1da177e4SLinus Torvalds goto keep_locked; 447*1da177e4SLinus Torvalds case PAGE_ACTIVATE: 448*1da177e4SLinus Torvalds goto activate_locked; 449*1da177e4SLinus Torvalds case PAGE_SUCCESS: 450*1da177e4SLinus Torvalds if (PageWriteback(page) || PageDirty(page)) 451*1da177e4SLinus Torvalds goto keep; 452*1da177e4SLinus Torvalds /* 453*1da177e4SLinus Torvalds * A synchronous write - probably a ramdisk. Go 454*1da177e4SLinus Torvalds * ahead and try to reclaim the page. 455*1da177e4SLinus Torvalds */ 456*1da177e4SLinus Torvalds if (TestSetPageLocked(page)) 457*1da177e4SLinus Torvalds goto keep; 458*1da177e4SLinus Torvalds if (PageDirty(page) || PageWriteback(page)) 459*1da177e4SLinus Torvalds goto keep_locked; 460*1da177e4SLinus Torvalds mapping = page_mapping(page); 461*1da177e4SLinus Torvalds case PAGE_CLEAN: 462*1da177e4SLinus Torvalds ; /* try to free the page below */ 463*1da177e4SLinus Torvalds } 464*1da177e4SLinus Torvalds } 465*1da177e4SLinus Torvalds 466*1da177e4SLinus Torvalds /* 467*1da177e4SLinus Torvalds * If the page has buffers, try to free the buffer mappings 468*1da177e4SLinus Torvalds * associated with this page. If we succeed we try to free 469*1da177e4SLinus Torvalds * the page as well. 470*1da177e4SLinus Torvalds * 471*1da177e4SLinus Torvalds * We do this even if the page is PageDirty(). 472*1da177e4SLinus Torvalds * try_to_release_page() does not perform I/O, but it is 473*1da177e4SLinus Torvalds * possible for a page to have PageDirty set, but it is actually 474*1da177e4SLinus Torvalds * clean (all its buffers are clean). This happens if the 475*1da177e4SLinus Torvalds * buffers were written out directly, with submit_bh(). ext3 476*1da177e4SLinus Torvalds * will do this, as well as the blockdev mapping. 477*1da177e4SLinus Torvalds * try_to_release_page() will discover that cleanness and will 478*1da177e4SLinus Torvalds * drop the buffers and mark the page clean - it can be freed. 479*1da177e4SLinus Torvalds * 480*1da177e4SLinus Torvalds * Rarely, pages can have buffers and no ->mapping. These are 481*1da177e4SLinus Torvalds * the pages which were not successfully invalidated in 482*1da177e4SLinus Torvalds * truncate_complete_page(). We try to drop those buffers here 483*1da177e4SLinus Torvalds * and if that worked, and the page is no longer mapped into 484*1da177e4SLinus Torvalds * process address space (page_count == 1) it can be freed. 485*1da177e4SLinus Torvalds * Otherwise, leave the page on the LRU so it is swappable. 486*1da177e4SLinus Torvalds */ 487*1da177e4SLinus Torvalds if (PagePrivate(page)) { 488*1da177e4SLinus Torvalds if (!try_to_release_page(page, sc->gfp_mask)) 489*1da177e4SLinus Torvalds goto activate_locked; 490*1da177e4SLinus Torvalds if (!mapping && page_count(page) == 1) 491*1da177e4SLinus Torvalds goto free_it; 492*1da177e4SLinus Torvalds } 493*1da177e4SLinus Torvalds 494*1da177e4SLinus Torvalds if (!mapping) 495*1da177e4SLinus Torvalds goto keep_locked; /* truncate got there first */ 496*1da177e4SLinus Torvalds 497*1da177e4SLinus Torvalds write_lock_irq(&mapping->tree_lock); 498*1da177e4SLinus Torvalds 499*1da177e4SLinus Torvalds /* 500*1da177e4SLinus Torvalds * The non-racy check for busy page. It is critical to check 501*1da177e4SLinus Torvalds * PageDirty _after_ making sure that the page is freeable and 502*1da177e4SLinus Torvalds * not in use by anybody. (pagecache + us == 2) 503*1da177e4SLinus Torvalds */ 504*1da177e4SLinus Torvalds if (page_count(page) != 2 || PageDirty(page)) { 505*1da177e4SLinus Torvalds write_unlock_irq(&mapping->tree_lock); 506*1da177e4SLinus Torvalds goto keep_locked; 507*1da177e4SLinus Torvalds } 508*1da177e4SLinus Torvalds 509*1da177e4SLinus Torvalds #ifdef CONFIG_SWAP 510*1da177e4SLinus Torvalds if (PageSwapCache(page)) { 511*1da177e4SLinus Torvalds swp_entry_t swap = { .val = page->private }; 512*1da177e4SLinus Torvalds __delete_from_swap_cache(page); 513*1da177e4SLinus Torvalds write_unlock_irq(&mapping->tree_lock); 514*1da177e4SLinus Torvalds swap_free(swap); 515*1da177e4SLinus Torvalds __put_page(page); /* The pagecache ref */ 516*1da177e4SLinus Torvalds goto free_it; 517*1da177e4SLinus Torvalds } 518*1da177e4SLinus Torvalds #endif /* CONFIG_SWAP */ 519*1da177e4SLinus Torvalds 520*1da177e4SLinus Torvalds __remove_from_page_cache(page); 521*1da177e4SLinus Torvalds write_unlock_irq(&mapping->tree_lock); 522*1da177e4SLinus Torvalds __put_page(page); 523*1da177e4SLinus Torvalds 524*1da177e4SLinus Torvalds free_it: 525*1da177e4SLinus Torvalds unlock_page(page); 526*1da177e4SLinus Torvalds reclaimed++; 527*1da177e4SLinus Torvalds if (!pagevec_add(&freed_pvec, page)) 528*1da177e4SLinus Torvalds __pagevec_release_nonlru(&freed_pvec); 529*1da177e4SLinus Torvalds continue; 530*1da177e4SLinus Torvalds 531*1da177e4SLinus Torvalds activate_locked: 532*1da177e4SLinus Torvalds SetPageActive(page); 533*1da177e4SLinus Torvalds pgactivate++; 534*1da177e4SLinus Torvalds keep_locked: 535*1da177e4SLinus Torvalds unlock_page(page); 536*1da177e4SLinus Torvalds keep: 537*1da177e4SLinus Torvalds list_add(&page->lru, &ret_pages); 538*1da177e4SLinus Torvalds BUG_ON(PageLRU(page)); 539*1da177e4SLinus Torvalds } 540*1da177e4SLinus Torvalds list_splice(&ret_pages, page_list); 541*1da177e4SLinus Torvalds if (pagevec_count(&freed_pvec)) 542*1da177e4SLinus Torvalds __pagevec_release_nonlru(&freed_pvec); 543*1da177e4SLinus Torvalds mod_page_state(pgactivate, pgactivate); 544*1da177e4SLinus Torvalds sc->nr_reclaimed += reclaimed; 545*1da177e4SLinus Torvalds return reclaimed; 546*1da177e4SLinus Torvalds } 547*1da177e4SLinus Torvalds 548*1da177e4SLinus Torvalds /* 549*1da177e4SLinus Torvalds * zone->lru_lock is heavily contended. Some of the functions that 550*1da177e4SLinus Torvalds * shrink the lists perform better by taking out a batch of pages 551*1da177e4SLinus Torvalds * and working on them outside the LRU lock. 552*1da177e4SLinus Torvalds * 553*1da177e4SLinus Torvalds * For pagecache intensive workloads, this function is the hottest 554*1da177e4SLinus Torvalds * spot in the kernel (apart from copy_*_user functions). 555*1da177e4SLinus Torvalds * 556*1da177e4SLinus Torvalds * Appropriate locks must be held before calling this function. 557*1da177e4SLinus Torvalds * 558*1da177e4SLinus Torvalds * @nr_to_scan: The number of pages to look through on the list. 559*1da177e4SLinus Torvalds * @src: The LRU list to pull pages off. 560*1da177e4SLinus Torvalds * @dst: The temp list to put pages on to. 561*1da177e4SLinus Torvalds * @scanned: The number of pages that were scanned. 562*1da177e4SLinus Torvalds * 563*1da177e4SLinus Torvalds * returns how many pages were moved onto *@dst. 564*1da177e4SLinus Torvalds */ 565*1da177e4SLinus Torvalds static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 566*1da177e4SLinus Torvalds struct list_head *dst, int *scanned) 567*1da177e4SLinus Torvalds { 568*1da177e4SLinus Torvalds int nr_taken = 0; 569*1da177e4SLinus Torvalds struct page *page; 570*1da177e4SLinus Torvalds int scan = 0; 571*1da177e4SLinus Torvalds 572*1da177e4SLinus Torvalds while (scan++ < nr_to_scan && !list_empty(src)) { 573*1da177e4SLinus Torvalds page = lru_to_page(src); 574*1da177e4SLinus Torvalds prefetchw_prev_lru_page(page, src, flags); 575*1da177e4SLinus Torvalds 576*1da177e4SLinus Torvalds if (!TestClearPageLRU(page)) 577*1da177e4SLinus Torvalds BUG(); 578*1da177e4SLinus Torvalds list_del(&page->lru); 579*1da177e4SLinus Torvalds if (get_page_testone(page)) { 580*1da177e4SLinus Torvalds /* 581*1da177e4SLinus Torvalds * It is being freed elsewhere 582*1da177e4SLinus Torvalds */ 583*1da177e4SLinus Torvalds __put_page(page); 584*1da177e4SLinus Torvalds SetPageLRU(page); 585*1da177e4SLinus Torvalds list_add(&page->lru, src); 586*1da177e4SLinus Torvalds continue; 587*1da177e4SLinus Torvalds } else { 588*1da177e4SLinus Torvalds list_add(&page->lru, dst); 589*1da177e4SLinus Torvalds nr_taken++; 590*1da177e4SLinus Torvalds } 591*1da177e4SLinus Torvalds } 592*1da177e4SLinus Torvalds 593*1da177e4SLinus Torvalds *scanned = scan; 594*1da177e4SLinus Torvalds return nr_taken; 595*1da177e4SLinus Torvalds } 596*1da177e4SLinus Torvalds 597*1da177e4SLinus Torvalds /* 598*1da177e4SLinus Torvalds * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 599*1da177e4SLinus Torvalds */ 600*1da177e4SLinus Torvalds static void shrink_cache(struct zone *zone, struct scan_control *sc) 601*1da177e4SLinus Torvalds { 602*1da177e4SLinus Torvalds LIST_HEAD(page_list); 603*1da177e4SLinus Torvalds struct pagevec pvec; 604*1da177e4SLinus Torvalds int max_scan = sc->nr_to_scan; 605*1da177e4SLinus Torvalds 606*1da177e4SLinus Torvalds pagevec_init(&pvec, 1); 607*1da177e4SLinus Torvalds 608*1da177e4SLinus Torvalds lru_add_drain(); 609*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 610*1da177e4SLinus Torvalds while (max_scan > 0) { 611*1da177e4SLinus Torvalds struct page *page; 612*1da177e4SLinus Torvalds int nr_taken; 613*1da177e4SLinus Torvalds int nr_scan; 614*1da177e4SLinus Torvalds int nr_freed; 615*1da177e4SLinus Torvalds 616*1da177e4SLinus Torvalds nr_taken = isolate_lru_pages(sc->swap_cluster_max, 617*1da177e4SLinus Torvalds &zone->inactive_list, 618*1da177e4SLinus Torvalds &page_list, &nr_scan); 619*1da177e4SLinus Torvalds zone->nr_inactive -= nr_taken; 620*1da177e4SLinus Torvalds zone->pages_scanned += nr_scan; 621*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 622*1da177e4SLinus Torvalds 623*1da177e4SLinus Torvalds if (nr_taken == 0) 624*1da177e4SLinus Torvalds goto done; 625*1da177e4SLinus Torvalds 626*1da177e4SLinus Torvalds max_scan -= nr_scan; 627*1da177e4SLinus Torvalds if (current_is_kswapd()) 628*1da177e4SLinus Torvalds mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 629*1da177e4SLinus Torvalds else 630*1da177e4SLinus Torvalds mod_page_state_zone(zone, pgscan_direct, nr_scan); 631*1da177e4SLinus Torvalds nr_freed = shrink_list(&page_list, sc); 632*1da177e4SLinus Torvalds if (current_is_kswapd()) 633*1da177e4SLinus Torvalds mod_page_state(kswapd_steal, nr_freed); 634*1da177e4SLinus Torvalds mod_page_state_zone(zone, pgsteal, nr_freed); 635*1da177e4SLinus Torvalds sc->nr_to_reclaim -= nr_freed; 636*1da177e4SLinus Torvalds 637*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 638*1da177e4SLinus Torvalds /* 639*1da177e4SLinus Torvalds * Put back any unfreeable pages. 640*1da177e4SLinus Torvalds */ 641*1da177e4SLinus Torvalds while (!list_empty(&page_list)) { 642*1da177e4SLinus Torvalds page = lru_to_page(&page_list); 643*1da177e4SLinus Torvalds if (TestSetPageLRU(page)) 644*1da177e4SLinus Torvalds BUG(); 645*1da177e4SLinus Torvalds list_del(&page->lru); 646*1da177e4SLinus Torvalds if (PageActive(page)) 647*1da177e4SLinus Torvalds add_page_to_active_list(zone, page); 648*1da177e4SLinus Torvalds else 649*1da177e4SLinus Torvalds add_page_to_inactive_list(zone, page); 650*1da177e4SLinus Torvalds if (!pagevec_add(&pvec, page)) { 651*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 652*1da177e4SLinus Torvalds __pagevec_release(&pvec); 653*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 654*1da177e4SLinus Torvalds } 655*1da177e4SLinus Torvalds } 656*1da177e4SLinus Torvalds } 657*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 658*1da177e4SLinus Torvalds done: 659*1da177e4SLinus Torvalds pagevec_release(&pvec); 660*1da177e4SLinus Torvalds } 661*1da177e4SLinus Torvalds 662*1da177e4SLinus Torvalds /* 663*1da177e4SLinus Torvalds * This moves pages from the active list to the inactive list. 664*1da177e4SLinus Torvalds * 665*1da177e4SLinus Torvalds * We move them the other way if the page is referenced by one or more 666*1da177e4SLinus Torvalds * processes, from rmap. 667*1da177e4SLinus Torvalds * 668*1da177e4SLinus Torvalds * If the pages are mostly unmapped, the processing is fast and it is 669*1da177e4SLinus Torvalds * appropriate to hold zone->lru_lock across the whole operation. But if 670*1da177e4SLinus Torvalds * the pages are mapped, the processing is slow (page_referenced()) so we 671*1da177e4SLinus Torvalds * should drop zone->lru_lock around each page. It's impossible to balance 672*1da177e4SLinus Torvalds * this, so instead we remove the pages from the LRU while processing them. 673*1da177e4SLinus Torvalds * It is safe to rely on PG_active against the non-LRU pages in here because 674*1da177e4SLinus Torvalds * nobody will play with that bit on a non-LRU page. 675*1da177e4SLinus Torvalds * 676*1da177e4SLinus Torvalds * The downside is that we have to touch page->_count against each page. 677*1da177e4SLinus Torvalds * But we had to alter page->flags anyway. 678*1da177e4SLinus Torvalds */ 679*1da177e4SLinus Torvalds static void 680*1da177e4SLinus Torvalds refill_inactive_zone(struct zone *zone, struct scan_control *sc) 681*1da177e4SLinus Torvalds { 682*1da177e4SLinus Torvalds int pgmoved; 683*1da177e4SLinus Torvalds int pgdeactivate = 0; 684*1da177e4SLinus Torvalds int pgscanned; 685*1da177e4SLinus Torvalds int nr_pages = sc->nr_to_scan; 686*1da177e4SLinus Torvalds LIST_HEAD(l_hold); /* The pages which were snipped off */ 687*1da177e4SLinus Torvalds LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 688*1da177e4SLinus Torvalds LIST_HEAD(l_active); /* Pages to go onto the active_list */ 689*1da177e4SLinus Torvalds struct page *page; 690*1da177e4SLinus Torvalds struct pagevec pvec; 691*1da177e4SLinus Torvalds int reclaim_mapped = 0; 692*1da177e4SLinus Torvalds long mapped_ratio; 693*1da177e4SLinus Torvalds long distress; 694*1da177e4SLinus Torvalds long swap_tendency; 695*1da177e4SLinus Torvalds 696*1da177e4SLinus Torvalds lru_add_drain(); 697*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 698*1da177e4SLinus Torvalds pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 699*1da177e4SLinus Torvalds &l_hold, &pgscanned); 700*1da177e4SLinus Torvalds zone->pages_scanned += pgscanned; 701*1da177e4SLinus Torvalds zone->nr_active -= pgmoved; 702*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 703*1da177e4SLinus Torvalds 704*1da177e4SLinus Torvalds /* 705*1da177e4SLinus Torvalds * `distress' is a measure of how much trouble we're having reclaiming 706*1da177e4SLinus Torvalds * pages. 0 -> no problems. 100 -> great trouble. 707*1da177e4SLinus Torvalds */ 708*1da177e4SLinus Torvalds distress = 100 >> zone->prev_priority; 709*1da177e4SLinus Torvalds 710*1da177e4SLinus Torvalds /* 711*1da177e4SLinus Torvalds * The point of this algorithm is to decide when to start reclaiming 712*1da177e4SLinus Torvalds * mapped memory instead of just pagecache. Work out how much memory 713*1da177e4SLinus Torvalds * is mapped. 714*1da177e4SLinus Torvalds */ 715*1da177e4SLinus Torvalds mapped_ratio = (sc->nr_mapped * 100) / total_memory; 716*1da177e4SLinus Torvalds 717*1da177e4SLinus Torvalds /* 718*1da177e4SLinus Torvalds * Now decide how much we really want to unmap some pages. The mapped 719*1da177e4SLinus Torvalds * ratio is downgraded - just because there's a lot of mapped memory 720*1da177e4SLinus Torvalds * doesn't necessarily mean that page reclaim isn't succeeding. 721*1da177e4SLinus Torvalds * 722*1da177e4SLinus Torvalds * The distress ratio is important - we don't want to start going oom. 723*1da177e4SLinus Torvalds * 724*1da177e4SLinus Torvalds * A 100% value of vm_swappiness overrides this algorithm altogether. 725*1da177e4SLinus Torvalds */ 726*1da177e4SLinus Torvalds swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; 727*1da177e4SLinus Torvalds 728*1da177e4SLinus Torvalds /* 729*1da177e4SLinus Torvalds * Now use this metric to decide whether to start moving mapped memory 730*1da177e4SLinus Torvalds * onto the inactive list. 731*1da177e4SLinus Torvalds */ 732*1da177e4SLinus Torvalds if (swap_tendency >= 100) 733*1da177e4SLinus Torvalds reclaim_mapped = 1; 734*1da177e4SLinus Torvalds 735*1da177e4SLinus Torvalds while (!list_empty(&l_hold)) { 736*1da177e4SLinus Torvalds cond_resched(); 737*1da177e4SLinus Torvalds page = lru_to_page(&l_hold); 738*1da177e4SLinus Torvalds list_del(&page->lru); 739*1da177e4SLinus Torvalds if (page_mapped(page)) { 740*1da177e4SLinus Torvalds if (!reclaim_mapped || 741*1da177e4SLinus Torvalds (total_swap_pages == 0 && PageAnon(page)) || 742*1da177e4SLinus Torvalds page_referenced(page, 0, sc->priority <= 0)) { 743*1da177e4SLinus Torvalds list_add(&page->lru, &l_active); 744*1da177e4SLinus Torvalds continue; 745*1da177e4SLinus Torvalds } 746*1da177e4SLinus Torvalds } 747*1da177e4SLinus Torvalds list_add(&page->lru, &l_inactive); 748*1da177e4SLinus Torvalds } 749*1da177e4SLinus Torvalds 750*1da177e4SLinus Torvalds pagevec_init(&pvec, 1); 751*1da177e4SLinus Torvalds pgmoved = 0; 752*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 753*1da177e4SLinus Torvalds while (!list_empty(&l_inactive)) { 754*1da177e4SLinus Torvalds page = lru_to_page(&l_inactive); 755*1da177e4SLinus Torvalds prefetchw_prev_lru_page(page, &l_inactive, flags); 756*1da177e4SLinus Torvalds if (TestSetPageLRU(page)) 757*1da177e4SLinus Torvalds BUG(); 758*1da177e4SLinus Torvalds if (!TestClearPageActive(page)) 759*1da177e4SLinus Torvalds BUG(); 760*1da177e4SLinus Torvalds list_move(&page->lru, &zone->inactive_list); 761*1da177e4SLinus Torvalds pgmoved++; 762*1da177e4SLinus Torvalds if (!pagevec_add(&pvec, page)) { 763*1da177e4SLinus Torvalds zone->nr_inactive += pgmoved; 764*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 765*1da177e4SLinus Torvalds pgdeactivate += pgmoved; 766*1da177e4SLinus Torvalds pgmoved = 0; 767*1da177e4SLinus Torvalds if (buffer_heads_over_limit) 768*1da177e4SLinus Torvalds pagevec_strip(&pvec); 769*1da177e4SLinus Torvalds __pagevec_release(&pvec); 770*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 771*1da177e4SLinus Torvalds } 772*1da177e4SLinus Torvalds } 773*1da177e4SLinus Torvalds zone->nr_inactive += pgmoved; 774*1da177e4SLinus Torvalds pgdeactivate += pgmoved; 775*1da177e4SLinus Torvalds if (buffer_heads_over_limit) { 776*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 777*1da177e4SLinus Torvalds pagevec_strip(&pvec); 778*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 779*1da177e4SLinus Torvalds } 780*1da177e4SLinus Torvalds 781*1da177e4SLinus Torvalds pgmoved = 0; 782*1da177e4SLinus Torvalds while (!list_empty(&l_active)) { 783*1da177e4SLinus Torvalds page = lru_to_page(&l_active); 784*1da177e4SLinus Torvalds prefetchw_prev_lru_page(page, &l_active, flags); 785*1da177e4SLinus Torvalds if (TestSetPageLRU(page)) 786*1da177e4SLinus Torvalds BUG(); 787*1da177e4SLinus Torvalds BUG_ON(!PageActive(page)); 788*1da177e4SLinus Torvalds list_move(&page->lru, &zone->active_list); 789*1da177e4SLinus Torvalds pgmoved++; 790*1da177e4SLinus Torvalds if (!pagevec_add(&pvec, page)) { 791*1da177e4SLinus Torvalds zone->nr_active += pgmoved; 792*1da177e4SLinus Torvalds pgmoved = 0; 793*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 794*1da177e4SLinus Torvalds __pagevec_release(&pvec); 795*1da177e4SLinus Torvalds spin_lock_irq(&zone->lru_lock); 796*1da177e4SLinus Torvalds } 797*1da177e4SLinus Torvalds } 798*1da177e4SLinus Torvalds zone->nr_active += pgmoved; 799*1da177e4SLinus Torvalds spin_unlock_irq(&zone->lru_lock); 800*1da177e4SLinus Torvalds pagevec_release(&pvec); 801*1da177e4SLinus Torvalds 802*1da177e4SLinus Torvalds mod_page_state_zone(zone, pgrefill, pgscanned); 803*1da177e4SLinus Torvalds mod_page_state(pgdeactivate, pgdeactivate); 804*1da177e4SLinus Torvalds } 805*1da177e4SLinus Torvalds 806*1da177e4SLinus Torvalds /* 807*1da177e4SLinus Torvalds * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 808*1da177e4SLinus Torvalds */ 809*1da177e4SLinus Torvalds static void 810*1da177e4SLinus Torvalds shrink_zone(struct zone *zone, struct scan_control *sc) 811*1da177e4SLinus Torvalds { 812*1da177e4SLinus Torvalds unsigned long nr_active; 813*1da177e4SLinus Torvalds unsigned long nr_inactive; 814*1da177e4SLinus Torvalds 815*1da177e4SLinus Torvalds /* 816*1da177e4SLinus Torvalds * Add one to `nr_to_scan' just to make sure that the kernel will 817*1da177e4SLinus Torvalds * slowly sift through the active list. 818*1da177e4SLinus Torvalds */ 819*1da177e4SLinus Torvalds zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 820*1da177e4SLinus Torvalds nr_active = zone->nr_scan_active; 821*1da177e4SLinus Torvalds if (nr_active >= sc->swap_cluster_max) 822*1da177e4SLinus Torvalds zone->nr_scan_active = 0; 823*1da177e4SLinus Torvalds else 824*1da177e4SLinus Torvalds nr_active = 0; 825*1da177e4SLinus Torvalds 826*1da177e4SLinus Torvalds zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 827*1da177e4SLinus Torvalds nr_inactive = zone->nr_scan_inactive; 828*1da177e4SLinus Torvalds if (nr_inactive >= sc->swap_cluster_max) 829*1da177e4SLinus Torvalds zone->nr_scan_inactive = 0; 830*1da177e4SLinus Torvalds else 831*1da177e4SLinus Torvalds nr_inactive = 0; 832*1da177e4SLinus Torvalds 833*1da177e4SLinus Torvalds sc->nr_to_reclaim = sc->swap_cluster_max; 834*1da177e4SLinus Torvalds 835*1da177e4SLinus Torvalds while (nr_active || nr_inactive) { 836*1da177e4SLinus Torvalds if (nr_active) { 837*1da177e4SLinus Torvalds sc->nr_to_scan = min(nr_active, 838*1da177e4SLinus Torvalds (unsigned long)sc->swap_cluster_max); 839*1da177e4SLinus Torvalds nr_active -= sc->nr_to_scan; 840*1da177e4SLinus Torvalds refill_inactive_zone(zone, sc); 841*1da177e4SLinus Torvalds } 842*1da177e4SLinus Torvalds 843*1da177e4SLinus Torvalds if (nr_inactive) { 844*1da177e4SLinus Torvalds sc->nr_to_scan = min(nr_inactive, 845*1da177e4SLinus Torvalds (unsigned long)sc->swap_cluster_max); 846*1da177e4SLinus Torvalds nr_inactive -= sc->nr_to_scan; 847*1da177e4SLinus Torvalds shrink_cache(zone, sc); 848*1da177e4SLinus Torvalds if (sc->nr_to_reclaim <= 0) 849*1da177e4SLinus Torvalds break; 850*1da177e4SLinus Torvalds } 851*1da177e4SLinus Torvalds } 852*1da177e4SLinus Torvalds 853*1da177e4SLinus Torvalds throttle_vm_writeout(); 854*1da177e4SLinus Torvalds } 855*1da177e4SLinus Torvalds 856*1da177e4SLinus Torvalds /* 857*1da177e4SLinus Torvalds * This is the direct reclaim path, for page-allocating processes. We only 858*1da177e4SLinus Torvalds * try to reclaim pages from zones which will satisfy the caller's allocation 859*1da177e4SLinus Torvalds * request. 860*1da177e4SLinus Torvalds * 861*1da177e4SLinus Torvalds * We reclaim from a zone even if that zone is over pages_high. Because: 862*1da177e4SLinus Torvalds * a) The caller may be trying to free *extra* pages to satisfy a higher-order 863*1da177e4SLinus Torvalds * allocation or 864*1da177e4SLinus Torvalds * b) The zones may be over pages_high but they must go *over* pages_high to 865*1da177e4SLinus Torvalds * satisfy the `incremental min' zone defense algorithm. 866*1da177e4SLinus Torvalds * 867*1da177e4SLinus Torvalds * Returns the number of reclaimed pages. 868*1da177e4SLinus Torvalds * 869*1da177e4SLinus Torvalds * If a zone is deemed to be full of pinned pages then just give it a light 870*1da177e4SLinus Torvalds * scan then give up on it. 871*1da177e4SLinus Torvalds */ 872*1da177e4SLinus Torvalds static void 873*1da177e4SLinus Torvalds shrink_caches(struct zone **zones, struct scan_control *sc) 874*1da177e4SLinus Torvalds { 875*1da177e4SLinus Torvalds int i; 876*1da177e4SLinus Torvalds 877*1da177e4SLinus Torvalds for (i = 0; zones[i] != NULL; i++) { 878*1da177e4SLinus Torvalds struct zone *zone = zones[i]; 879*1da177e4SLinus Torvalds 880*1da177e4SLinus Torvalds if (zone->present_pages == 0) 881*1da177e4SLinus Torvalds continue; 882*1da177e4SLinus Torvalds 883*1da177e4SLinus Torvalds if (!cpuset_zone_allowed(zone)) 884*1da177e4SLinus Torvalds continue; 885*1da177e4SLinus Torvalds 886*1da177e4SLinus Torvalds zone->temp_priority = sc->priority; 887*1da177e4SLinus Torvalds if (zone->prev_priority > sc->priority) 888*1da177e4SLinus Torvalds zone->prev_priority = sc->priority; 889*1da177e4SLinus Torvalds 890*1da177e4SLinus Torvalds if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 891*1da177e4SLinus Torvalds continue; /* Let kswapd poll it */ 892*1da177e4SLinus Torvalds 893*1da177e4SLinus Torvalds shrink_zone(zone, sc); 894*1da177e4SLinus Torvalds } 895*1da177e4SLinus Torvalds } 896*1da177e4SLinus Torvalds 897*1da177e4SLinus Torvalds /* 898*1da177e4SLinus Torvalds * This is the main entry point to direct page reclaim. 899*1da177e4SLinus Torvalds * 900*1da177e4SLinus Torvalds * If a full scan of the inactive list fails to free enough memory then we 901*1da177e4SLinus Torvalds * are "out of memory" and something needs to be killed. 902*1da177e4SLinus Torvalds * 903*1da177e4SLinus Torvalds * If the caller is !__GFP_FS then the probability of a failure is reasonably 904*1da177e4SLinus Torvalds * high - the zone may be full of dirty or under-writeback pages, which this 905*1da177e4SLinus Torvalds * caller can't do much about. We kick pdflush and take explicit naps in the 906*1da177e4SLinus Torvalds * hope that some of these pages can be written. But if the allocating task 907*1da177e4SLinus Torvalds * holds filesystem locks which prevent writeout this might not work, and the 908*1da177e4SLinus Torvalds * allocation attempt will fail. 909*1da177e4SLinus Torvalds */ 910*1da177e4SLinus Torvalds int try_to_free_pages(struct zone **zones, 911*1da177e4SLinus Torvalds unsigned int gfp_mask, unsigned int order) 912*1da177e4SLinus Torvalds { 913*1da177e4SLinus Torvalds int priority; 914*1da177e4SLinus Torvalds int ret = 0; 915*1da177e4SLinus Torvalds int total_scanned = 0, total_reclaimed = 0; 916*1da177e4SLinus Torvalds struct reclaim_state *reclaim_state = current->reclaim_state; 917*1da177e4SLinus Torvalds struct scan_control sc; 918*1da177e4SLinus Torvalds unsigned long lru_pages = 0; 919*1da177e4SLinus Torvalds int i; 920*1da177e4SLinus Torvalds 921*1da177e4SLinus Torvalds sc.gfp_mask = gfp_mask; 922*1da177e4SLinus Torvalds sc.may_writepage = 0; 923*1da177e4SLinus Torvalds 924*1da177e4SLinus Torvalds inc_page_state(allocstall); 925*1da177e4SLinus Torvalds 926*1da177e4SLinus Torvalds for (i = 0; zones[i] != NULL; i++) { 927*1da177e4SLinus Torvalds struct zone *zone = zones[i]; 928*1da177e4SLinus Torvalds 929*1da177e4SLinus Torvalds if (!cpuset_zone_allowed(zone)) 930*1da177e4SLinus Torvalds continue; 931*1da177e4SLinus Torvalds 932*1da177e4SLinus Torvalds zone->temp_priority = DEF_PRIORITY; 933*1da177e4SLinus Torvalds lru_pages += zone->nr_active + zone->nr_inactive; 934*1da177e4SLinus Torvalds } 935*1da177e4SLinus Torvalds 936*1da177e4SLinus Torvalds for (priority = DEF_PRIORITY; priority >= 0; priority--) { 937*1da177e4SLinus Torvalds sc.nr_mapped = read_page_state(nr_mapped); 938*1da177e4SLinus Torvalds sc.nr_scanned = 0; 939*1da177e4SLinus Torvalds sc.nr_reclaimed = 0; 940*1da177e4SLinus Torvalds sc.priority = priority; 941*1da177e4SLinus Torvalds sc.swap_cluster_max = SWAP_CLUSTER_MAX; 942*1da177e4SLinus Torvalds shrink_caches(zones, &sc); 943*1da177e4SLinus Torvalds shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 944*1da177e4SLinus Torvalds if (reclaim_state) { 945*1da177e4SLinus Torvalds sc.nr_reclaimed += reclaim_state->reclaimed_slab; 946*1da177e4SLinus Torvalds reclaim_state->reclaimed_slab = 0; 947*1da177e4SLinus Torvalds } 948*1da177e4SLinus Torvalds total_scanned += sc.nr_scanned; 949*1da177e4SLinus Torvalds total_reclaimed += sc.nr_reclaimed; 950*1da177e4SLinus Torvalds if (total_reclaimed >= sc.swap_cluster_max) { 951*1da177e4SLinus Torvalds ret = 1; 952*1da177e4SLinus Torvalds goto out; 953*1da177e4SLinus Torvalds } 954*1da177e4SLinus Torvalds 955*1da177e4SLinus Torvalds /* 956*1da177e4SLinus Torvalds * Try to write back as many pages as we just scanned. This 957*1da177e4SLinus Torvalds * tends to cause slow streaming writers to write data to the 958*1da177e4SLinus Torvalds * disk smoothly, at the dirtying rate, which is nice. But 959*1da177e4SLinus Torvalds * that's undesirable in laptop mode, where we *want* lumpy 960*1da177e4SLinus Torvalds * writeout. So in laptop mode, write out the whole world. 961*1da177e4SLinus Torvalds */ 962*1da177e4SLinus Torvalds if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 963*1da177e4SLinus Torvalds wakeup_bdflush(laptop_mode ? 0 : total_scanned); 964*1da177e4SLinus Torvalds sc.may_writepage = 1; 965*1da177e4SLinus Torvalds } 966*1da177e4SLinus Torvalds 967*1da177e4SLinus Torvalds /* Take a nap, wait for some writeback to complete */ 968*1da177e4SLinus Torvalds if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 969*1da177e4SLinus Torvalds blk_congestion_wait(WRITE, HZ/10); 970*1da177e4SLinus Torvalds } 971*1da177e4SLinus Torvalds out: 972*1da177e4SLinus Torvalds for (i = 0; zones[i] != 0; i++) { 973*1da177e4SLinus Torvalds struct zone *zone = zones[i]; 974*1da177e4SLinus Torvalds 975*1da177e4SLinus Torvalds if (!cpuset_zone_allowed(zone)) 976*1da177e4SLinus Torvalds continue; 977*1da177e4SLinus Torvalds 978*1da177e4SLinus Torvalds zone->prev_priority = zone->temp_priority; 979*1da177e4SLinus Torvalds } 980*1da177e4SLinus Torvalds return ret; 981*1da177e4SLinus Torvalds } 982*1da177e4SLinus Torvalds 983*1da177e4SLinus Torvalds /* 984*1da177e4SLinus Torvalds * For kswapd, balance_pgdat() will work across all this node's zones until 985*1da177e4SLinus Torvalds * they are all at pages_high. 986*1da177e4SLinus Torvalds * 987*1da177e4SLinus Torvalds * If `nr_pages' is non-zero then it is the number of pages which are to be 988*1da177e4SLinus Torvalds * reclaimed, regardless of the zone occupancies. This is a software suspend 989*1da177e4SLinus Torvalds * special. 990*1da177e4SLinus Torvalds * 991*1da177e4SLinus Torvalds * Returns the number of pages which were actually freed. 992*1da177e4SLinus Torvalds * 993*1da177e4SLinus Torvalds * There is special handling here for zones which are full of pinned pages. 994*1da177e4SLinus Torvalds * This can happen if the pages are all mlocked, or if they are all used by 995*1da177e4SLinus Torvalds * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 996*1da177e4SLinus Torvalds * What we do is to detect the case where all pages in the zone have been 997*1da177e4SLinus Torvalds * scanned twice and there has been zero successful reclaim. Mark the zone as 998*1da177e4SLinus Torvalds * dead and from now on, only perform a short scan. Basically we're polling 999*1da177e4SLinus Torvalds * the zone for when the problem goes away. 1000*1da177e4SLinus Torvalds * 1001*1da177e4SLinus Torvalds * kswapd scans the zones in the highmem->normal->dma direction. It skips 1002*1da177e4SLinus Torvalds * zones which have free_pages > pages_high, but once a zone is found to have 1003*1da177e4SLinus Torvalds * free_pages <= pages_high, we scan that zone and the lower zones regardless 1004*1da177e4SLinus Torvalds * of the number of free pages in the lower zones. This interoperates with 1005*1da177e4SLinus Torvalds * the page allocator fallback scheme to ensure that aging of pages is balanced 1006*1da177e4SLinus Torvalds * across the zones. 1007*1da177e4SLinus Torvalds */ 1008*1da177e4SLinus Torvalds static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1009*1da177e4SLinus Torvalds { 1010*1da177e4SLinus Torvalds int to_free = nr_pages; 1011*1da177e4SLinus Torvalds int all_zones_ok; 1012*1da177e4SLinus Torvalds int priority; 1013*1da177e4SLinus Torvalds int i; 1014*1da177e4SLinus Torvalds int total_scanned, total_reclaimed; 1015*1da177e4SLinus Torvalds struct reclaim_state *reclaim_state = current->reclaim_state; 1016*1da177e4SLinus Torvalds struct scan_control sc; 1017*1da177e4SLinus Torvalds 1018*1da177e4SLinus Torvalds loop_again: 1019*1da177e4SLinus Torvalds total_scanned = 0; 1020*1da177e4SLinus Torvalds total_reclaimed = 0; 1021*1da177e4SLinus Torvalds sc.gfp_mask = GFP_KERNEL; 1022*1da177e4SLinus Torvalds sc.may_writepage = 0; 1023*1da177e4SLinus Torvalds sc.nr_mapped = read_page_state(nr_mapped); 1024*1da177e4SLinus Torvalds 1025*1da177e4SLinus Torvalds inc_page_state(pageoutrun); 1026*1da177e4SLinus Torvalds 1027*1da177e4SLinus Torvalds for (i = 0; i < pgdat->nr_zones; i++) { 1028*1da177e4SLinus Torvalds struct zone *zone = pgdat->node_zones + i; 1029*1da177e4SLinus Torvalds 1030*1da177e4SLinus Torvalds zone->temp_priority = DEF_PRIORITY; 1031*1da177e4SLinus Torvalds } 1032*1da177e4SLinus Torvalds 1033*1da177e4SLinus Torvalds for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1034*1da177e4SLinus Torvalds int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1035*1da177e4SLinus Torvalds unsigned long lru_pages = 0; 1036*1da177e4SLinus Torvalds 1037*1da177e4SLinus Torvalds all_zones_ok = 1; 1038*1da177e4SLinus Torvalds 1039*1da177e4SLinus Torvalds if (nr_pages == 0) { 1040*1da177e4SLinus Torvalds /* 1041*1da177e4SLinus Torvalds * Scan in the highmem->dma direction for the highest 1042*1da177e4SLinus Torvalds * zone which needs scanning 1043*1da177e4SLinus Torvalds */ 1044*1da177e4SLinus Torvalds for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1045*1da177e4SLinus Torvalds struct zone *zone = pgdat->node_zones + i; 1046*1da177e4SLinus Torvalds 1047*1da177e4SLinus Torvalds if (zone->present_pages == 0) 1048*1da177e4SLinus Torvalds continue; 1049*1da177e4SLinus Torvalds 1050*1da177e4SLinus Torvalds if (zone->all_unreclaimable && 1051*1da177e4SLinus Torvalds priority != DEF_PRIORITY) 1052*1da177e4SLinus Torvalds continue; 1053*1da177e4SLinus Torvalds 1054*1da177e4SLinus Torvalds if (!zone_watermark_ok(zone, order, 1055*1da177e4SLinus Torvalds zone->pages_high, 0, 0, 0)) { 1056*1da177e4SLinus Torvalds end_zone = i; 1057*1da177e4SLinus Torvalds goto scan; 1058*1da177e4SLinus Torvalds } 1059*1da177e4SLinus Torvalds } 1060*1da177e4SLinus Torvalds goto out; 1061*1da177e4SLinus Torvalds } else { 1062*1da177e4SLinus Torvalds end_zone = pgdat->nr_zones - 1; 1063*1da177e4SLinus Torvalds } 1064*1da177e4SLinus Torvalds scan: 1065*1da177e4SLinus Torvalds for (i = 0; i <= end_zone; i++) { 1066*1da177e4SLinus Torvalds struct zone *zone = pgdat->node_zones + i; 1067*1da177e4SLinus Torvalds 1068*1da177e4SLinus Torvalds lru_pages += zone->nr_active + zone->nr_inactive; 1069*1da177e4SLinus Torvalds } 1070*1da177e4SLinus Torvalds 1071*1da177e4SLinus Torvalds /* 1072*1da177e4SLinus Torvalds * Now scan the zone in the dma->highmem direction, stopping 1073*1da177e4SLinus Torvalds * at the last zone which needs scanning. 1074*1da177e4SLinus Torvalds * 1075*1da177e4SLinus Torvalds * We do this because the page allocator works in the opposite 1076*1da177e4SLinus Torvalds * direction. This prevents the page allocator from allocating 1077*1da177e4SLinus Torvalds * pages behind kswapd's direction of progress, which would 1078*1da177e4SLinus Torvalds * cause too much scanning of the lower zones. 1079*1da177e4SLinus Torvalds */ 1080*1da177e4SLinus Torvalds for (i = 0; i <= end_zone; i++) { 1081*1da177e4SLinus Torvalds struct zone *zone = pgdat->node_zones + i; 1082*1da177e4SLinus Torvalds 1083*1da177e4SLinus Torvalds if (zone->present_pages == 0) 1084*1da177e4SLinus Torvalds continue; 1085*1da177e4SLinus Torvalds 1086*1da177e4SLinus Torvalds if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1087*1da177e4SLinus Torvalds continue; 1088*1da177e4SLinus Torvalds 1089*1da177e4SLinus Torvalds if (nr_pages == 0) { /* Not software suspend */ 1090*1da177e4SLinus Torvalds if (!zone_watermark_ok(zone, order, 1091*1da177e4SLinus Torvalds zone->pages_high, end_zone, 0, 0)) 1092*1da177e4SLinus Torvalds all_zones_ok = 0; 1093*1da177e4SLinus Torvalds } 1094*1da177e4SLinus Torvalds zone->temp_priority = priority; 1095*1da177e4SLinus Torvalds if (zone->prev_priority > priority) 1096*1da177e4SLinus Torvalds zone->prev_priority = priority; 1097*1da177e4SLinus Torvalds sc.nr_scanned = 0; 1098*1da177e4SLinus Torvalds sc.nr_reclaimed = 0; 1099*1da177e4SLinus Torvalds sc.priority = priority; 1100*1da177e4SLinus Torvalds sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; 1101*1da177e4SLinus Torvalds shrink_zone(zone, &sc); 1102*1da177e4SLinus Torvalds reclaim_state->reclaimed_slab = 0; 1103*1da177e4SLinus Torvalds shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); 1104*1da177e4SLinus Torvalds sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1105*1da177e4SLinus Torvalds total_reclaimed += sc.nr_reclaimed; 1106*1da177e4SLinus Torvalds total_scanned += sc.nr_scanned; 1107*1da177e4SLinus Torvalds if (zone->all_unreclaimable) 1108*1da177e4SLinus Torvalds continue; 1109*1da177e4SLinus Torvalds if (zone->pages_scanned >= (zone->nr_active + 1110*1da177e4SLinus Torvalds zone->nr_inactive) * 4) 1111*1da177e4SLinus Torvalds zone->all_unreclaimable = 1; 1112*1da177e4SLinus Torvalds /* 1113*1da177e4SLinus Torvalds * If we've done a decent amount of scanning and 1114*1da177e4SLinus Torvalds * the reclaim ratio is low, start doing writepage 1115*1da177e4SLinus Torvalds * even in laptop mode 1116*1da177e4SLinus Torvalds */ 1117*1da177e4SLinus Torvalds if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1118*1da177e4SLinus Torvalds total_scanned > total_reclaimed+total_reclaimed/2) 1119*1da177e4SLinus Torvalds sc.may_writepage = 1; 1120*1da177e4SLinus Torvalds } 1121*1da177e4SLinus Torvalds if (nr_pages && to_free > total_reclaimed) 1122*1da177e4SLinus Torvalds continue; /* swsusp: need to do more work */ 1123*1da177e4SLinus Torvalds if (all_zones_ok) 1124*1da177e4SLinus Torvalds break; /* kswapd: all done */ 1125*1da177e4SLinus Torvalds /* 1126*1da177e4SLinus Torvalds * OK, kswapd is getting into trouble. Take a nap, then take 1127*1da177e4SLinus Torvalds * another pass across the zones. 1128*1da177e4SLinus Torvalds */ 1129*1da177e4SLinus Torvalds if (total_scanned && priority < DEF_PRIORITY - 2) 1130*1da177e4SLinus Torvalds blk_congestion_wait(WRITE, HZ/10); 1131*1da177e4SLinus Torvalds 1132*1da177e4SLinus Torvalds /* 1133*1da177e4SLinus Torvalds * We do this so kswapd doesn't build up large priorities for 1134*1da177e4SLinus Torvalds * example when it is freeing in parallel with allocators. It 1135*1da177e4SLinus Torvalds * matches the direct reclaim path behaviour in terms of impact 1136*1da177e4SLinus Torvalds * on zone->*_priority. 1137*1da177e4SLinus Torvalds */ 1138*1da177e4SLinus Torvalds if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1139*1da177e4SLinus Torvalds break; 1140*1da177e4SLinus Torvalds } 1141*1da177e4SLinus Torvalds out: 1142*1da177e4SLinus Torvalds for (i = 0; i < pgdat->nr_zones; i++) { 1143*1da177e4SLinus Torvalds struct zone *zone = pgdat->node_zones + i; 1144*1da177e4SLinus Torvalds 1145*1da177e4SLinus Torvalds zone->prev_priority = zone->temp_priority; 1146*1da177e4SLinus Torvalds } 1147*1da177e4SLinus Torvalds if (!all_zones_ok) { 1148*1da177e4SLinus Torvalds cond_resched(); 1149*1da177e4SLinus Torvalds goto loop_again; 1150*1da177e4SLinus Torvalds } 1151*1da177e4SLinus Torvalds 1152*1da177e4SLinus Torvalds return total_reclaimed; 1153*1da177e4SLinus Torvalds } 1154*1da177e4SLinus Torvalds 1155*1da177e4SLinus Torvalds /* 1156*1da177e4SLinus Torvalds * The background pageout daemon, started as a kernel thread 1157*1da177e4SLinus Torvalds * from the init process. 1158*1da177e4SLinus Torvalds * 1159*1da177e4SLinus Torvalds * This basically trickles out pages so that we have _some_ 1160*1da177e4SLinus Torvalds * free memory available even if there is no other activity 1161*1da177e4SLinus Torvalds * that frees anything up. This is needed for things like routing 1162*1da177e4SLinus Torvalds * etc, where we otherwise might have all activity going on in 1163*1da177e4SLinus Torvalds * asynchronous contexts that cannot page things out. 1164*1da177e4SLinus Torvalds * 1165*1da177e4SLinus Torvalds * If there are applications that are active memory-allocators 1166*1da177e4SLinus Torvalds * (most normal use), this basically shouldn't matter. 1167*1da177e4SLinus Torvalds */ 1168*1da177e4SLinus Torvalds static int kswapd(void *p) 1169*1da177e4SLinus Torvalds { 1170*1da177e4SLinus Torvalds unsigned long order; 1171*1da177e4SLinus Torvalds pg_data_t *pgdat = (pg_data_t*)p; 1172*1da177e4SLinus Torvalds struct task_struct *tsk = current; 1173*1da177e4SLinus Torvalds DEFINE_WAIT(wait); 1174*1da177e4SLinus Torvalds struct reclaim_state reclaim_state = { 1175*1da177e4SLinus Torvalds .reclaimed_slab = 0, 1176*1da177e4SLinus Torvalds }; 1177*1da177e4SLinus Torvalds cpumask_t cpumask; 1178*1da177e4SLinus Torvalds 1179*1da177e4SLinus Torvalds daemonize("kswapd%d", pgdat->node_id); 1180*1da177e4SLinus Torvalds cpumask = node_to_cpumask(pgdat->node_id); 1181*1da177e4SLinus Torvalds if (!cpus_empty(cpumask)) 1182*1da177e4SLinus Torvalds set_cpus_allowed(tsk, cpumask); 1183*1da177e4SLinus Torvalds current->reclaim_state = &reclaim_state; 1184*1da177e4SLinus Torvalds 1185*1da177e4SLinus Torvalds /* 1186*1da177e4SLinus Torvalds * Tell the memory management that we're a "memory allocator", 1187*1da177e4SLinus Torvalds * and that if we need more memory we should get access to it 1188*1da177e4SLinus Torvalds * regardless (see "__alloc_pages()"). "kswapd" should 1189*1da177e4SLinus Torvalds * never get caught in the normal page freeing logic. 1190*1da177e4SLinus Torvalds * 1191*1da177e4SLinus Torvalds * (Kswapd normally doesn't need memory anyway, but sometimes 1192*1da177e4SLinus Torvalds * you need a small amount of memory in order to be able to 1193*1da177e4SLinus Torvalds * page out something else, and this flag essentially protects 1194*1da177e4SLinus Torvalds * us from recursively trying to free more memory as we're 1195*1da177e4SLinus Torvalds * trying to free the first piece of memory in the first place). 1196*1da177e4SLinus Torvalds */ 1197*1da177e4SLinus Torvalds tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1198*1da177e4SLinus Torvalds 1199*1da177e4SLinus Torvalds order = 0; 1200*1da177e4SLinus Torvalds for ( ; ; ) { 1201*1da177e4SLinus Torvalds unsigned long new_order; 1202*1da177e4SLinus Torvalds if (current->flags & PF_FREEZE) 1203*1da177e4SLinus Torvalds refrigerator(PF_FREEZE); 1204*1da177e4SLinus Torvalds 1205*1da177e4SLinus Torvalds prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1206*1da177e4SLinus Torvalds new_order = pgdat->kswapd_max_order; 1207*1da177e4SLinus Torvalds pgdat->kswapd_max_order = 0; 1208*1da177e4SLinus Torvalds if (order < new_order) { 1209*1da177e4SLinus Torvalds /* 1210*1da177e4SLinus Torvalds * Don't sleep if someone wants a larger 'order' 1211*1da177e4SLinus Torvalds * allocation 1212*1da177e4SLinus Torvalds */ 1213*1da177e4SLinus Torvalds order = new_order; 1214*1da177e4SLinus Torvalds } else { 1215*1da177e4SLinus Torvalds schedule(); 1216*1da177e4SLinus Torvalds order = pgdat->kswapd_max_order; 1217*1da177e4SLinus Torvalds } 1218*1da177e4SLinus Torvalds finish_wait(&pgdat->kswapd_wait, &wait); 1219*1da177e4SLinus Torvalds 1220*1da177e4SLinus Torvalds balance_pgdat(pgdat, 0, order); 1221*1da177e4SLinus Torvalds } 1222*1da177e4SLinus Torvalds return 0; 1223*1da177e4SLinus Torvalds } 1224*1da177e4SLinus Torvalds 1225*1da177e4SLinus Torvalds /* 1226*1da177e4SLinus Torvalds * A zone is low on free memory, so wake its kswapd task to service it. 1227*1da177e4SLinus Torvalds */ 1228*1da177e4SLinus Torvalds void wakeup_kswapd(struct zone *zone, int order) 1229*1da177e4SLinus Torvalds { 1230*1da177e4SLinus Torvalds pg_data_t *pgdat; 1231*1da177e4SLinus Torvalds 1232*1da177e4SLinus Torvalds if (zone->present_pages == 0) 1233*1da177e4SLinus Torvalds return; 1234*1da177e4SLinus Torvalds 1235*1da177e4SLinus Torvalds pgdat = zone->zone_pgdat; 1236*1da177e4SLinus Torvalds if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) 1237*1da177e4SLinus Torvalds return; 1238*1da177e4SLinus Torvalds if (pgdat->kswapd_max_order < order) 1239*1da177e4SLinus Torvalds pgdat->kswapd_max_order = order; 1240*1da177e4SLinus Torvalds if (!cpuset_zone_allowed(zone)) 1241*1da177e4SLinus Torvalds return; 1242*1da177e4SLinus Torvalds if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) 1243*1da177e4SLinus Torvalds return; 1244*1da177e4SLinus Torvalds wake_up_interruptible(&zone->zone_pgdat->kswapd_wait); 1245*1da177e4SLinus Torvalds } 1246*1da177e4SLinus Torvalds 1247*1da177e4SLinus Torvalds #ifdef CONFIG_PM 1248*1da177e4SLinus Torvalds /* 1249*1da177e4SLinus Torvalds * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1250*1da177e4SLinus Torvalds * pages. 1251*1da177e4SLinus Torvalds */ 1252*1da177e4SLinus Torvalds int shrink_all_memory(int nr_pages) 1253*1da177e4SLinus Torvalds { 1254*1da177e4SLinus Torvalds pg_data_t *pgdat; 1255*1da177e4SLinus Torvalds int nr_to_free = nr_pages; 1256*1da177e4SLinus Torvalds int ret = 0; 1257*1da177e4SLinus Torvalds struct reclaim_state reclaim_state = { 1258*1da177e4SLinus Torvalds .reclaimed_slab = 0, 1259*1da177e4SLinus Torvalds }; 1260*1da177e4SLinus Torvalds 1261*1da177e4SLinus Torvalds current->reclaim_state = &reclaim_state; 1262*1da177e4SLinus Torvalds for_each_pgdat(pgdat) { 1263*1da177e4SLinus Torvalds int freed; 1264*1da177e4SLinus Torvalds freed = balance_pgdat(pgdat, nr_to_free, 0); 1265*1da177e4SLinus Torvalds ret += freed; 1266*1da177e4SLinus Torvalds nr_to_free -= freed; 1267*1da177e4SLinus Torvalds if (nr_to_free <= 0) 1268*1da177e4SLinus Torvalds break; 1269*1da177e4SLinus Torvalds } 1270*1da177e4SLinus Torvalds current->reclaim_state = NULL; 1271*1da177e4SLinus Torvalds return ret; 1272*1da177e4SLinus Torvalds } 1273*1da177e4SLinus Torvalds #endif 1274*1da177e4SLinus Torvalds 1275*1da177e4SLinus Torvalds #ifdef CONFIG_HOTPLUG_CPU 1276*1da177e4SLinus Torvalds /* It's optimal to keep kswapds on the same CPUs as their memory, but 1277*1da177e4SLinus Torvalds not required for correctness. So if the last cpu in a node goes 1278*1da177e4SLinus Torvalds away, we get changed to run anywhere: as the first one comes back, 1279*1da177e4SLinus Torvalds restore their cpu bindings. */ 1280*1da177e4SLinus Torvalds static int __devinit cpu_callback(struct notifier_block *nfb, 1281*1da177e4SLinus Torvalds unsigned long action, 1282*1da177e4SLinus Torvalds void *hcpu) 1283*1da177e4SLinus Torvalds { 1284*1da177e4SLinus Torvalds pg_data_t *pgdat; 1285*1da177e4SLinus Torvalds cpumask_t mask; 1286*1da177e4SLinus Torvalds 1287*1da177e4SLinus Torvalds if (action == CPU_ONLINE) { 1288*1da177e4SLinus Torvalds for_each_pgdat(pgdat) { 1289*1da177e4SLinus Torvalds mask = node_to_cpumask(pgdat->node_id); 1290*1da177e4SLinus Torvalds if (any_online_cpu(mask) != NR_CPUS) 1291*1da177e4SLinus Torvalds /* One of our CPUs online: restore mask */ 1292*1da177e4SLinus Torvalds set_cpus_allowed(pgdat->kswapd, mask); 1293*1da177e4SLinus Torvalds } 1294*1da177e4SLinus Torvalds } 1295*1da177e4SLinus Torvalds return NOTIFY_OK; 1296*1da177e4SLinus Torvalds } 1297*1da177e4SLinus Torvalds #endif /* CONFIG_HOTPLUG_CPU */ 1298*1da177e4SLinus Torvalds 1299*1da177e4SLinus Torvalds static int __init kswapd_init(void) 1300*1da177e4SLinus Torvalds { 1301*1da177e4SLinus Torvalds pg_data_t *pgdat; 1302*1da177e4SLinus Torvalds swap_setup(); 1303*1da177e4SLinus Torvalds for_each_pgdat(pgdat) 1304*1da177e4SLinus Torvalds pgdat->kswapd 1305*1da177e4SLinus Torvalds = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1306*1da177e4SLinus Torvalds total_memory = nr_free_pagecache_pages(); 1307*1da177e4SLinus Torvalds hotcpu_notifier(cpu_callback, 0); 1308*1da177e4SLinus Torvalds return 0; 1309*1da177e4SLinus Torvalds } 1310*1da177e4SLinus Torvalds 1311*1da177e4SLinus Torvalds module_init(kswapd_init) 1312