/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
static int swap_overflow;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

static struct swap_list_t swap_list = {-1, -1};

static struct swap_info_struct swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

/* For reference count accounting in swap_map */
/* enum for swap_map[] handling. internal use only */
enum {
	SWAP_MAP = 0,	/* ops for reference from swap users */
	SWAP_CACHE,	/* ops for reference from swap cache */
};

static inline int swap_count(unsigned short ent)
{
	return ent & SWAP_COUNT_MASK;
}

static inline bool swap_has_cache(unsigned short ent)
{
	return !!(ent & SWAP_HAS_CACHE);
}

static inline unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ret = count;

	if (has_cache)
		return SWAP_HAS_CACHE | ret;
	return ret;
}
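/*
 * A swap_map[] slot packs two things into one unsigned short: the count of
 * swap-map references in the low bits (masked by SWAP_COUNT_MASK) and the
 * SWAP_HAS_CACHE flag, which is a bit disjoint from SWAP_COUNT_MASK (see
 * the definitions in <linux/swap.h>), marking a swap-cache reference.
 * For example, assuming those definitions,
 *
 *	encode_swapmap(2, true) == SWAP_HAS_CACHE | 2
 *
 * means "two users of this entry, plus a page in the swap cache";
 * swap_count() and swap_has_cache() above simply undo that packing.
 */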

/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	int type = si - swap_info;
	swp_entry_t entry = swp_entry(type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(&swapper_space, entry.val);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
	 * We have to use trylock for avoiding deadlock. This is a special
	 * case and you should use try_to_free_swap() with explicit lock_page()
	 * in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	page_cache_release(page);
	return ret;
}

/*
 * We need this because the bdev->unplug_fn can sleep and we cannot
 * hold swap_lock while calling the unplug_fn. And swap_lock
 * cannot be turned into a mutex.
 */
static DECLARE_RWSEM(swap_unplug_sem);

void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
{
	swp_entry_t entry;

	down_read(&swap_unplug_sem);
	entry.val = page_private(page);
	if (PageSwapCache(page)) {
		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
		struct backing_dev_info *bdi;

		/*
		 * If the page is removed from swapcache from under us (with a
		 * racy try_to_unuse/swapoff) we need an additional reference
		 * count to avoid reading garbage from page_private(page) above.
		 * If the WARN_ON triggers during a swapoff it may be the race
		 * condition and it's harmless. However if it triggers without
		 * swapoff it signals a problem.
		 */
		WARN_ON(page_count(page) <= 1);

		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
		blk_run_backing_dev(bdi, page);
	}
	up_read(&swap_unplug_sem);
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	int err = 0;

	list_for_each_entry(se, &si->extent_list, list) {
		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		if (se->start_page == 0) {
			/* Do not discard the swap header page! */
			start_block += 1 << (PAGE_SHIFT - 9);
			nr_blocks -= 1 << (PAGE_SHIFT - 9);
			if (!nr_blocks)
				continue;
		}

		err = blkdev_issue_discard(si->bdev, start_block,
						nr_blocks, GFP_KERNEL,
						DISCARD_FL_BARRIER);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		struct list_head *lh;

		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
							nr_blocks, GFP_NOIO,
							DISCARD_FL_BARRIER))
				break;
		}

		lh = se->list.next;
		if (lh == &si->extent_list)
			lh = lh->next;
		se = list_entry(lh, struct swap_extent, list);
	}
}

static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256

static inline unsigned long scan_swap_map(struct swap_info_struct *si,
					  int cache)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.
Once we've allocated SWAPFILE_CLUSTER pages this 237 * way, however, we resort to first-free allocation, starting 238 * a new cluster. This prevents us from scattering swap pages 239 * all over the entire swap partition, so that we reduce 240 * overall disk seek times between swap pages. -- sct 241 * But we do now try to find an empty cluster. -Andrea 242 * And we let swap pages go all over an SSD partition. Hugh 243 */ 244 245 si->flags += SWP_SCANNING; 246 scan_base = offset = si->cluster_next; 247 248 if (unlikely(!si->cluster_nr--)) { 249 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 250 si->cluster_nr = SWAPFILE_CLUSTER - 1; 251 goto checks; 252 } 253 if (si->flags & SWP_DISCARDABLE) { 254 /* 255 * Start range check on racing allocations, in case 256 * they overlap the cluster we eventually decide on 257 * (we scan without swap_lock to allow preemption). 258 * It's hardly conceivable that cluster_nr could be 259 * wrapped during our scan, but don't depend on it. 260 */ 261 if (si->lowest_alloc) 262 goto checks; 263 si->lowest_alloc = si->max; 264 si->highest_alloc = 0; 265 } 266 spin_unlock(&swap_lock); 267 268 /* 269 * If seek is expensive, start searching for new cluster from 270 * start of partition, to minimize the span of allocated swap. 271 * But if seek is cheap, search from our current position, so 272 * that swap is allocated from all over the partition: if the 273 * Flash Translation Layer only remaps within limited zones, 274 * we don't want to wear out the first zone too quickly. 275 */ 276 if (!(si->flags & SWP_SOLIDSTATE)) 277 scan_base = offset = si->lowest_bit; 278 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 279 280 /* Locate the first empty (unaligned) cluster */ 281 for (; last_in_cluster <= si->highest_bit; offset++) { 282 if (si->swap_map[offset]) 283 last_in_cluster = offset + SWAPFILE_CLUSTER; 284 else if (offset == last_in_cluster) { 285 spin_lock(&swap_lock); 286 offset -= SWAPFILE_CLUSTER - 1; 287 si->cluster_next = offset; 288 si->cluster_nr = SWAPFILE_CLUSTER - 1; 289 found_free_cluster = 1; 290 goto checks; 291 } 292 if (unlikely(--latency_ration < 0)) { 293 cond_resched(); 294 latency_ration = LATENCY_LIMIT; 295 } 296 } 297 298 offset = si->lowest_bit; 299 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 300 301 /* Locate the first empty (unaligned) cluster */ 302 for (; last_in_cluster < scan_base; offset++) { 303 if (si->swap_map[offset]) 304 last_in_cluster = offset + SWAPFILE_CLUSTER; 305 else if (offset == last_in_cluster) { 306 spin_lock(&swap_lock); 307 offset -= SWAPFILE_CLUSTER - 1; 308 si->cluster_next = offset; 309 si->cluster_nr = SWAPFILE_CLUSTER - 1; 310 found_free_cluster = 1; 311 goto checks; 312 } 313 if (unlikely(--latency_ration < 0)) { 314 cond_resched(); 315 latency_ration = LATENCY_LIMIT; 316 } 317 } 318 319 offset = scan_base; 320 spin_lock(&swap_lock); 321 si->cluster_nr = SWAPFILE_CLUSTER - 1; 322 si->lowest_alloc = 0; 323 } 324 325 checks: 326 if (!(si->flags & SWP_WRITEOK)) 327 goto no_page; 328 if (!si->highest_bit) 329 goto no_page; 330 if (offset > si->highest_bit) 331 scan_base = offset = si->lowest_bit; 332 333 /* reuse swap entry of cache-only swap if not busy. 
*/ 334 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 335 int swap_was_freed; 336 spin_unlock(&swap_lock); 337 swap_was_freed = __try_to_reclaim_swap(si, offset); 338 spin_lock(&swap_lock); 339 /* entry was freed successfully, try to use this again */ 340 if (swap_was_freed) 341 goto checks; 342 goto scan; /* check next one */ 343 } 344 345 if (si->swap_map[offset]) 346 goto scan; 347 348 if (offset == si->lowest_bit) 349 si->lowest_bit++; 350 if (offset == si->highest_bit) 351 si->highest_bit--; 352 si->inuse_pages++; 353 if (si->inuse_pages == si->pages) { 354 si->lowest_bit = si->max; 355 si->highest_bit = 0; 356 } 357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 358 si->swap_map[offset] = encode_swapmap(0, true); 359 else /* at suspend */ 360 si->swap_map[offset] = encode_swapmap(1, false); 361 si->cluster_next = offset + 1; 362 si->flags -= SWP_SCANNING; 363 364 if (si->lowest_alloc) { 365 /* 366 * Only set when SWP_DISCARDABLE, and there's a scan 367 * for a free cluster in progress or just completed. 368 */ 369 if (found_free_cluster) { 370 /* 371 * To optimize wear-levelling, discard the 372 * old data of the cluster, taking care not to 373 * discard any of its pages that have already 374 * been allocated by racing tasks (offset has 375 * already stepped over any at the beginning). 376 */ 377 if (offset < si->highest_alloc && 378 si->lowest_alloc <= last_in_cluster) 379 last_in_cluster = si->lowest_alloc - 1; 380 si->flags |= SWP_DISCARDING; 381 spin_unlock(&swap_lock); 382 383 if (offset < last_in_cluster) 384 discard_swap_cluster(si, offset, 385 last_in_cluster - offset + 1); 386 387 spin_lock(&swap_lock); 388 si->lowest_alloc = 0; 389 si->flags &= ~SWP_DISCARDING; 390 391 smp_mb(); /* wake_up_bit advises this */ 392 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); 393 394 } else if (si->flags & SWP_DISCARDING) { 395 /* 396 * Delay using pages allocated by racing tasks 397 * until the whole discard has been issued. We 398 * could defer that delay until swap_writepage, 399 * but it's easier to keep this self-contained. 400 */ 401 spin_unlock(&swap_lock); 402 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 403 wait_for_discard, TASK_UNINTERRUPTIBLE); 404 spin_lock(&swap_lock); 405 } else { 406 /* 407 * Note pages allocated by racing tasks while 408 * scan for a free cluster is in progress, so 409 * that its final discard can exclude them. 
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}

swp_entry_t get_swap_page(void)
{
	struct swap_info_struct *si;
	pgoff_t offset;
	int type, next;
	int wrapped = 0;

	spin_lock(&swap_lock);
	if (nr_swap_pages <= 0)
		goto noswap;
	nr_swap_pages--;

	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
		si = swap_info + type;
		next = si->next;
		if (next < 0 ||
		    (!wrapped && si->prio != swap_info[next].prio)) {
			next = swap_list.head;
			wrapped++;
		}

		if (!si->highest_bit)
			continue;
		if (!(si->flags & SWP_WRITEOK))
			continue;

		swap_list.next = next;
		/* This is called for allocating swap entry for cache */
		offset = scan_swap_map(si, SWAP_CACHE);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		next = swap_list.next;
	}

	nr_swap_pages++;
noswap:
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	spin_lock(&swap_lock);
	si = swap_info + type;
	if (si->flags & SWP_WRITEOK) {
		nr_swap_pages--;
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, SWAP_MAP);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		nr_swap_pages++;
	}
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = &swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (!p->swap_map[offset])
		goto bad_free;
	spin_lock(&swap_lock);
	return p;

bad_free:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
	goto out;
bad_offset:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static int swap_entry_free(struct swap_info_struct *p,
swp_entry_t ent, int cache) 559 { 560 unsigned long offset = swp_offset(ent); 561 int count = swap_count(p->swap_map[offset]); 562 bool has_cache; 563 564 has_cache = swap_has_cache(p->swap_map[offset]); 565 566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 567 if (count < SWAP_MAP_MAX) { 568 count--; 569 p->swap_map[offset] = encode_swapmap(count, has_cache); 570 } 571 } else { /* dropping swap cache flag */ 572 VM_BUG_ON(!has_cache); 573 p->swap_map[offset] = encode_swapmap(count, false); 574 575 } 576 /* return code. */ 577 count = p->swap_map[offset]; 578 /* free if no reference */ 579 if (!count) { 580 if (offset < p->lowest_bit) 581 p->lowest_bit = offset; 582 if (offset > p->highest_bit) 583 p->highest_bit = offset; 584 if (p->prio > swap_info[swap_list.next].prio) 585 swap_list.next = p - swap_info; 586 nr_swap_pages++; 587 p->inuse_pages--; 588 } 589 if (!swap_count(count)) 590 mem_cgroup_uncharge_swap(ent); 591 return count; 592 } 593 594 /* 595 * Caller has made sure that the swapdevice corresponding to entry 596 * is still around or has not been recycled. 597 */ 598 void swap_free(swp_entry_t entry) 599 { 600 struct swap_info_struct * p; 601 602 p = swap_info_get(entry); 603 if (p) { 604 swap_entry_free(p, entry, SWAP_MAP); 605 spin_unlock(&swap_lock); 606 } 607 } 608 609 /* 610 * Called after dropping swapcache to decrease refcnt to swap entries. 611 */ 612 void swapcache_free(swp_entry_t entry, struct page *page) 613 { 614 struct swap_info_struct *p; 615 int ret; 616 617 p = swap_info_get(entry); 618 if (p) { 619 ret = swap_entry_free(p, entry, SWAP_CACHE); 620 if (page) { 621 bool swapout; 622 if (ret) 623 swapout = true; /* the end of swap out */ 624 else 625 swapout = false; /* no more swap users! */ 626 mem_cgroup_uncharge_swapcache(page, entry, swapout); 627 } 628 spin_unlock(&swap_lock); 629 } 630 return; 631 } 632 633 /* 634 * How many references to page are currently swapped out? 635 */ 636 static inline int page_swapcount(struct page *page) 637 { 638 int count = 0; 639 struct swap_info_struct *p; 640 swp_entry_t entry; 641 642 entry.val = page_private(page); 643 p = swap_info_get(entry); 644 if (p) { 645 count = swap_count(p->swap_map[swp_offset(entry)]); 646 spin_unlock(&swap_lock); 647 } 648 return count; 649 } 650 651 /* 652 * We can write to an anon page without COW if there are no other references 653 * to it. And as a side-effect, free up its swap: because the old content 654 * on disk will never be read, and seeking back there to write new content 655 * later would only waste time away from clustering. 656 */ 657 int reuse_swap_page(struct page *page) 658 { 659 int count; 660 661 VM_BUG_ON(!PageLocked(page)); 662 count = page_mapcount(page); 663 if (count <= 1 && PageSwapCache(page)) { 664 count += page_swapcount(page); 665 if (count == 1 && !PageWriteback(page)) { 666 delete_from_swap_cache(page); 667 SetPageDirty(page); 668 } 669 } 670 return count == 1; 671 } 672 673 /* 674 * If swap is getting full, or if there are no more mappings of this page, 675 * then try_to_free_swap is called to free its swap space. 676 */ 677 int try_to_free_swap(struct page *page) 678 { 679 VM_BUG_ON(!PageLocked(page)); 680 681 if (!PageSwapCache(page)) 682 return 0; 683 if (PageWriteback(page)) 684 return 0; 685 if (page_swapcount(page)) 686 return 0; 687 688 delete_from_swap_cache(page); 689 SetPageDirty(page); 690 return 1; 691 } 692 693 /* 694 * Free the swap entry like above, but also try to 695 * free the page cache entry if it is the last user. 
696 */ 697 int free_swap_and_cache(swp_entry_t entry) 698 { 699 struct swap_info_struct *p; 700 struct page *page = NULL; 701 702 if (non_swap_entry(entry)) 703 return 1; 704 705 p = swap_info_get(entry); 706 if (p) { 707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 708 page = find_get_page(&swapper_space, entry.val); 709 if (page && !trylock_page(page)) { 710 page_cache_release(page); 711 page = NULL; 712 } 713 } 714 spin_unlock(&swap_lock); 715 } 716 if (page) { 717 /* 718 * Not mapped elsewhere, or swap space full? Free it! 719 * Also recheck PageSwapCache now page is locked (above). 720 */ 721 if (PageSwapCache(page) && !PageWriteback(page) && 722 (!page_mapped(page) || vm_swap_full())) { 723 delete_from_swap_cache(page); 724 SetPageDirty(page); 725 } 726 unlock_page(page); 727 page_cache_release(page); 728 } 729 return p != NULL; 730 } 731 732 #ifdef CONFIG_HIBERNATION 733 /* 734 * Find the swap type that corresponds to given device (if any). 735 * 736 * @offset - number of the PAGE_SIZE-sized block of the device, starting 737 * from 0, in which the swap header is expected to be located. 738 * 739 * This is needed for the suspend to disk (aka swsusp). 740 */ 741 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 742 { 743 struct block_device *bdev = NULL; 744 int i; 745 746 if (device) 747 bdev = bdget(device); 748 749 spin_lock(&swap_lock); 750 for (i = 0; i < nr_swapfiles; i++) { 751 struct swap_info_struct *sis = swap_info + i; 752 753 if (!(sis->flags & SWP_WRITEOK)) 754 continue; 755 756 if (!bdev) { 757 if (bdev_p) 758 *bdev_p = bdgrab(sis->bdev); 759 760 spin_unlock(&swap_lock); 761 return i; 762 } 763 if (bdev == sis->bdev) { 764 struct swap_extent *se; 765 766 se = list_entry(sis->extent_list.next, 767 struct swap_extent, list); 768 if (se->start_block == offset) { 769 if (bdev_p) 770 *bdev_p = bdgrab(sis->bdev); 771 772 spin_unlock(&swap_lock); 773 bdput(bdev); 774 return i; 775 } 776 } 777 } 778 spin_unlock(&swap_lock); 779 if (bdev) 780 bdput(bdev); 781 782 return -ENODEV; 783 } 784 785 /* 786 * Return either the total number of swap pages of given type, or the number 787 * of free pages of that type (depending on @free) 788 * 789 * This is needed for software suspend 790 */ 791 unsigned int count_swap_pages(int type, int free) 792 { 793 unsigned int n = 0; 794 795 if (type < nr_swapfiles) { 796 spin_lock(&swap_lock); 797 if (swap_info[type].flags & SWP_WRITEOK) { 798 n = swap_info[type].pages; 799 if (free) 800 n -= swap_info[type].inuse_pages; 801 } 802 spin_unlock(&swap_lock); 803 } 804 return n; 805 } 806 #endif 807 808 /* 809 * No need to decide whether this PTE shares the swap entry with others, 810 * just let do_wp_page work it out if a write is requested later - to 811 * force COW, vm_page_prot omits write permission from any private vma. 
812 */ 813 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 814 unsigned long addr, swp_entry_t entry, struct page *page) 815 { 816 struct mem_cgroup *ptr = NULL; 817 spinlock_t *ptl; 818 pte_t *pte; 819 int ret = 1; 820 821 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 822 ret = -ENOMEM; 823 goto out_nolock; 824 } 825 826 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 827 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 828 if (ret > 0) 829 mem_cgroup_cancel_charge_swapin(ptr); 830 ret = 0; 831 goto out; 832 } 833 834 inc_mm_counter(vma->vm_mm, anon_rss); 835 get_page(page); 836 set_pte_at(vma->vm_mm, addr, pte, 837 pte_mkold(mk_pte(page, vma->vm_page_prot))); 838 page_add_anon_rmap(page, vma, addr); 839 mem_cgroup_commit_charge_swapin(page, ptr); 840 swap_free(entry); 841 /* 842 * Move the page to the active list so it is not 843 * immediately swapped out again after swapon. 844 */ 845 activate_page(page); 846 out: 847 pte_unmap_unlock(pte, ptl); 848 out_nolock: 849 return ret; 850 } 851 852 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 853 unsigned long addr, unsigned long end, 854 swp_entry_t entry, struct page *page) 855 { 856 pte_t swp_pte = swp_entry_to_pte(entry); 857 pte_t *pte; 858 int ret = 0; 859 860 /* 861 * We don't actually need pte lock while scanning for swp_pte: since 862 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 863 * page table while we're scanning; though it could get zapped, and on 864 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 865 * of unmatched parts which look like swp_pte, so unuse_pte must 866 * recheck under pte lock. Scanning without pte lock lets it be 867 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 868 */ 869 pte = pte_offset_map(pmd, addr); 870 do { 871 /* 872 * swapoff spends a _lot_ of time in this loop! 873 * Test inline before going to call unuse_pte. 
874 */ 875 if (unlikely(pte_same(*pte, swp_pte))) { 876 pte_unmap(pte); 877 ret = unuse_pte(vma, pmd, addr, entry, page); 878 if (ret) 879 goto out; 880 pte = pte_offset_map(pmd, addr); 881 } 882 } while (pte++, addr += PAGE_SIZE, addr != end); 883 pte_unmap(pte - 1); 884 out: 885 return ret; 886 } 887 888 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 889 unsigned long addr, unsigned long end, 890 swp_entry_t entry, struct page *page) 891 { 892 pmd_t *pmd; 893 unsigned long next; 894 int ret; 895 896 pmd = pmd_offset(pud, addr); 897 do { 898 next = pmd_addr_end(addr, end); 899 if (pmd_none_or_clear_bad(pmd)) 900 continue; 901 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 902 if (ret) 903 return ret; 904 } while (pmd++, addr = next, addr != end); 905 return 0; 906 } 907 908 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 909 unsigned long addr, unsigned long end, 910 swp_entry_t entry, struct page *page) 911 { 912 pud_t *pud; 913 unsigned long next; 914 int ret; 915 916 pud = pud_offset(pgd, addr); 917 do { 918 next = pud_addr_end(addr, end); 919 if (pud_none_or_clear_bad(pud)) 920 continue; 921 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 922 if (ret) 923 return ret; 924 } while (pud++, addr = next, addr != end); 925 return 0; 926 } 927 928 static int unuse_vma(struct vm_area_struct *vma, 929 swp_entry_t entry, struct page *page) 930 { 931 pgd_t *pgd; 932 unsigned long addr, end, next; 933 int ret; 934 935 if (page->mapping) { 936 addr = page_address_in_vma(page, vma); 937 if (addr == -EFAULT) 938 return 0; 939 else 940 end = addr + PAGE_SIZE; 941 } else { 942 addr = vma->vm_start; 943 end = vma->vm_end; 944 } 945 946 pgd = pgd_offset(vma->vm_mm, addr); 947 do { 948 next = pgd_addr_end(addr, end); 949 if (pgd_none_or_clear_bad(pgd)) 950 continue; 951 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 952 if (ret) 953 return ret; 954 } while (pgd++, addr = next, addr != end); 955 return 0; 956 } 957 958 static int unuse_mm(struct mm_struct *mm, 959 swp_entry_t entry, struct page *page) 960 { 961 struct vm_area_struct *vma; 962 int ret = 0; 963 964 if (!down_read_trylock(&mm->mmap_sem)) { 965 /* 966 * Activate page so shrink_inactive_list is unlikely to unmap 967 * its ptes while lock is dropped, so swapoff can make progress. 968 */ 969 activate_page(page); 970 unlock_page(page); 971 down_read(&mm->mmap_sem); 972 lock_page(page); 973 } 974 for (vma = mm->mmap; vma; vma = vma->vm_next) { 975 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 976 break; 977 } 978 up_read(&mm->mmap_sem); 979 return (ret < 0)? ret: 0; 980 } 981 982 /* 983 * Scan swap_map from current position to next entry still in use. 984 * Recycle to start on reaching the end, returning 0 when empty. 985 */ 986 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 987 unsigned int prev) 988 { 989 unsigned int max = si->max; 990 unsigned int i = prev; 991 int count; 992 993 /* 994 * No need for swap_lock here: we're just looking 995 * for whether an entry is in use, not modifying it; false 996 * hits are okay, and sys_swapoff() has already prevented new 997 * allocations from this area (while holding swap_lock). 998 */ 999 for (;;) { 1000 if (++i >= max) { 1001 if (!prev) { 1002 i = 0; 1003 break; 1004 } 1005 /* 1006 * No entries in use at top of swap_map, 1007 * loop back to start and recheck there. 
1008 */ 1009 max = prev + 1; 1010 prev = 0; 1011 i = 1; 1012 } 1013 count = si->swap_map[i]; 1014 if (count && swap_count(count) != SWAP_MAP_BAD) 1015 break; 1016 } 1017 return i; 1018 } 1019 1020 /* 1021 * We completely avoid races by reading each swap page in advance, 1022 * and then search for the process using it. All the necessary 1023 * page table adjustments can then be made atomically. 1024 */ 1025 static int try_to_unuse(unsigned int type) 1026 { 1027 struct swap_info_struct * si = &swap_info[type]; 1028 struct mm_struct *start_mm; 1029 unsigned short *swap_map; 1030 unsigned short swcount; 1031 struct page *page; 1032 swp_entry_t entry; 1033 unsigned int i = 0; 1034 int retval = 0; 1035 int reset_overflow = 0; 1036 int shmem; 1037 1038 /* 1039 * When searching mms for an entry, a good strategy is to 1040 * start at the first mm we freed the previous entry from 1041 * (though actually we don't notice whether we or coincidence 1042 * freed the entry). Initialize this start_mm with a hold. 1043 * 1044 * A simpler strategy would be to start at the last mm we 1045 * freed the previous entry from; but that would take less 1046 * advantage of mmlist ordering, which clusters forked mms 1047 * together, child after parent. If we race with dup_mmap(), we 1048 * prefer to resolve parent before child, lest we miss entries 1049 * duplicated after we scanned child: using last mm would invert 1050 * that. Though it's only a serious concern when an overflowed 1051 * swap count is reset from SWAP_MAP_MAX, preventing a rescan. 1052 */ 1053 start_mm = &init_mm; 1054 atomic_inc(&init_mm.mm_users); 1055 1056 /* 1057 * Keep on scanning until all entries have gone. Usually, 1058 * one pass through swap_map is enough, but not necessarily: 1059 * there are races when an instance of an entry might be missed. 1060 */ 1061 while ((i = find_next_to_unuse(si, i)) != 0) { 1062 if (signal_pending(current)) { 1063 retval = -EINTR; 1064 break; 1065 } 1066 1067 /* 1068 * Get a page for the entry, using the existing swap 1069 * cache page if there is one. Otherwise, get a clean 1070 * page and read the swap into it. 1071 */ 1072 swap_map = &si->swap_map[i]; 1073 entry = swp_entry(type, i); 1074 page = read_swap_cache_async(entry, 1075 GFP_HIGHUSER_MOVABLE, NULL, 0); 1076 if (!page) { 1077 /* 1078 * Either swap_duplicate() failed because entry 1079 * has been freed independently, and will not be 1080 * reused since sys_swapoff() already disabled 1081 * allocation from here, or alloc_page() failed. 1082 */ 1083 if (!*swap_map) 1084 continue; 1085 retval = -ENOMEM; 1086 break; 1087 } 1088 1089 /* 1090 * Don't hold on to start_mm if it looks like exiting. 1091 */ 1092 if (atomic_read(&start_mm->mm_users) == 1) { 1093 mmput(start_mm); 1094 start_mm = &init_mm; 1095 atomic_inc(&init_mm.mm_users); 1096 } 1097 1098 /* 1099 * Wait for and lock page. When do_swap_page races with 1100 * try_to_unuse, do_swap_page can handle the fault much 1101 * faster than try_to_unuse can locate the entry. This 1102 * apparently redundant "wait_on_page_locked" lets try_to_unuse 1103 * defer to do_swap_page in such a case - in some tests, 1104 * do_swap_page and try_to_unuse repeatedly compete. 1105 */ 1106 wait_on_page_locked(page); 1107 wait_on_page_writeback(page); 1108 lock_page(page); 1109 wait_on_page_writeback(page); 1110 1111 /* 1112 * Remove all references to entry. 1113 * Whenever we reach init_mm, there's no address space 1114 * to search, but use it as a reminder to search shmem. 
1115 */ 1116 shmem = 0; 1117 swcount = *swap_map; 1118 if (swap_count(swcount)) { 1119 if (start_mm == &init_mm) 1120 shmem = shmem_unuse(entry, page); 1121 else 1122 retval = unuse_mm(start_mm, entry, page); 1123 } 1124 if (swap_count(*swap_map)) { 1125 int set_start_mm = (*swap_map >= swcount); 1126 struct list_head *p = &start_mm->mmlist; 1127 struct mm_struct *new_start_mm = start_mm; 1128 struct mm_struct *prev_mm = start_mm; 1129 struct mm_struct *mm; 1130 1131 atomic_inc(&new_start_mm->mm_users); 1132 atomic_inc(&prev_mm->mm_users); 1133 spin_lock(&mmlist_lock); 1134 while (swap_count(*swap_map) && !retval && !shmem && 1135 (p = p->next) != &start_mm->mmlist) { 1136 mm = list_entry(p, struct mm_struct, mmlist); 1137 if (!atomic_inc_not_zero(&mm->mm_users)) 1138 continue; 1139 spin_unlock(&mmlist_lock); 1140 mmput(prev_mm); 1141 prev_mm = mm; 1142 1143 cond_resched(); 1144 1145 swcount = *swap_map; 1146 if (!swap_count(swcount)) /* any usage ? */ 1147 ; 1148 else if (mm == &init_mm) { 1149 set_start_mm = 1; 1150 shmem = shmem_unuse(entry, page); 1151 } else 1152 retval = unuse_mm(mm, entry, page); 1153 1154 if (set_start_mm && 1155 swap_count(*swap_map) < swcount) { 1156 mmput(new_start_mm); 1157 atomic_inc(&mm->mm_users); 1158 new_start_mm = mm; 1159 set_start_mm = 0; 1160 } 1161 spin_lock(&mmlist_lock); 1162 } 1163 spin_unlock(&mmlist_lock); 1164 mmput(prev_mm); 1165 mmput(start_mm); 1166 start_mm = new_start_mm; 1167 } 1168 if (shmem) { 1169 /* page has already been unlocked and released */ 1170 if (shmem > 0) 1171 continue; 1172 retval = shmem; 1173 break; 1174 } 1175 if (retval) { 1176 unlock_page(page); 1177 page_cache_release(page); 1178 break; 1179 } 1180 1181 /* 1182 * How could swap count reach 0x7ffe ? 1183 * There's no way to repeat a swap page within an mm 1184 * (except in shmem, where it's the shared object which takes 1185 * the reference count)? 1186 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned 1187 * short is too small....) 1188 * If that's wrong, then we should worry more about 1189 * exit_mmap() and do_munmap() cases described above: 1190 * we might be resetting SWAP_MAP_MAX too early here. 1191 * We know "Undead"s can happen, they're okay, so don't 1192 * report them; but do report if we reset SWAP_MAP_MAX. 1193 */ 1194 /* We might release the lock_page() in unuse_mm(). */ 1195 if (!PageSwapCache(page) || page_private(page) != entry.val) 1196 goto retry; 1197 1198 if (swap_count(*swap_map) == SWAP_MAP_MAX) { 1199 spin_lock(&swap_lock); 1200 *swap_map = encode_swapmap(0, true); 1201 spin_unlock(&swap_lock); 1202 reset_overflow = 1; 1203 } 1204 1205 /* 1206 * If a reference remains (rare), we would like to leave 1207 * the page in the swap cache; but try_to_unmap could 1208 * then re-duplicate the entry once we drop page lock, 1209 * so we might loop indefinitely; also, that page could 1210 * not be swapped out to other storage meanwhile. So: 1211 * delete from cache even if there's another reference, 1212 * after ensuring that the data has been saved to disk - 1213 * since if the reference remains (rarer), it will be 1214 * read from disk into another page. Splitting into two 1215 * pages would be incorrect if swap supported "shared 1216 * private" pages, but they are handled by tmpfs files. 
1217 */ 1218 if (swap_count(*swap_map) && 1219 PageDirty(page) && PageSwapCache(page)) { 1220 struct writeback_control wbc = { 1221 .sync_mode = WB_SYNC_NONE, 1222 }; 1223 1224 swap_writepage(page, &wbc); 1225 lock_page(page); 1226 wait_on_page_writeback(page); 1227 } 1228 1229 /* 1230 * It is conceivable that a racing task removed this page from 1231 * swap cache just before we acquired the page lock at the top, 1232 * or while we dropped it in unuse_mm(). The page might even 1233 * be back in swap cache on another swap area: that we must not 1234 * delete, since it may not have been written out to swap yet. 1235 */ 1236 if (PageSwapCache(page) && 1237 likely(page_private(page) == entry.val)) 1238 delete_from_swap_cache(page); 1239 1240 /* 1241 * So we could skip searching mms once swap count went 1242 * to 1, we did not mark any present ptes as dirty: must 1243 * mark page dirty so shrink_page_list will preserve it. 1244 */ 1245 SetPageDirty(page); 1246 retry: 1247 unlock_page(page); 1248 page_cache_release(page); 1249 1250 /* 1251 * Make sure that we aren't completely killing 1252 * interactive performance. 1253 */ 1254 cond_resched(); 1255 } 1256 1257 mmput(start_mm); 1258 if (reset_overflow) { 1259 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 1260 swap_overflow = 0; 1261 } 1262 return retval; 1263 } 1264 1265 /* 1266 * After a successful try_to_unuse, if no swap is now in use, we know 1267 * we can empty the mmlist. swap_lock must be held on entry and exit. 1268 * Note that mmlist_lock nests inside swap_lock, and an mm must be 1269 * added to the mmlist just after page_duplicate - before would be racy. 1270 */ 1271 static void drain_mmlist(void) 1272 { 1273 struct list_head *p, *next; 1274 unsigned int i; 1275 1276 for (i = 0; i < nr_swapfiles; i++) 1277 if (swap_info[i].inuse_pages) 1278 return; 1279 spin_lock(&mmlist_lock); 1280 list_for_each_safe(p, next, &init_mm.mmlist) 1281 list_del_init(p); 1282 spin_unlock(&mmlist_lock); 1283 } 1284 1285 /* 1286 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1287 * corresponds to page offset `offset'. 1288 */ 1289 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 1290 { 1291 struct swap_extent *se = sis->curr_swap_extent; 1292 struct swap_extent *start_se = se; 1293 1294 for ( ; ; ) { 1295 struct list_head *lh; 1296 1297 if (se->start_page <= offset && 1298 offset < (se->start_page + se->nr_pages)) { 1299 return se->start_block + (offset - se->start_page); 1300 } 1301 lh = se->list.next; 1302 if (lh == &sis->extent_list) 1303 lh = lh->next; 1304 se = list_entry(lh, struct swap_extent, list); 1305 sis->curr_swap_extent = se; 1306 BUG_ON(se == start_se); /* It *must* be present */ 1307 } 1308 } 1309 1310 #ifdef CONFIG_HIBERNATION 1311 /* 1312 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1313 * corresponding to given index in swap_info (swap type). 1314 */ 1315 sector_t swapdev_block(int swap_type, pgoff_t offset) 1316 { 1317 struct swap_info_struct *sis; 1318 1319 if (swap_type >= nr_swapfiles) 1320 return 0; 1321 1322 sis = swap_info + swap_type; 1323 return (sis->flags & SWP_WRITEOK) ? 
map_swap_page(sis, offset) : 0; 1324 } 1325 #endif /* CONFIG_HIBERNATION */ 1326 1327 /* 1328 * Free all of a swapdev's extent information 1329 */ 1330 static void destroy_swap_extents(struct swap_info_struct *sis) 1331 { 1332 while (!list_empty(&sis->extent_list)) { 1333 struct swap_extent *se; 1334 1335 se = list_entry(sis->extent_list.next, 1336 struct swap_extent, list); 1337 list_del(&se->list); 1338 kfree(se); 1339 } 1340 } 1341 1342 /* 1343 * Add a block range (and the corresponding page range) into this swapdev's 1344 * extent list. The extent list is kept sorted in page order. 1345 * 1346 * This function rather assumes that it is called in ascending page order. 1347 */ 1348 static int 1349 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350 unsigned long nr_pages, sector_t start_block) 1351 { 1352 struct swap_extent *se; 1353 struct swap_extent *new_se; 1354 struct list_head *lh; 1355 1356 lh = sis->extent_list.prev; /* The highest page extent */ 1357 if (lh != &sis->extent_list) { 1358 se = list_entry(lh, struct swap_extent, list); 1359 BUG_ON(se->start_page + se->nr_pages != start_page); 1360 if (se->start_block + se->nr_pages == start_block) { 1361 /* Merge it */ 1362 se->nr_pages += nr_pages; 1363 return 0; 1364 } 1365 } 1366 1367 /* 1368 * No merge. Insert a new extent, preserving ordering. 1369 */ 1370 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 1371 if (new_se == NULL) 1372 return -ENOMEM; 1373 new_se->start_page = start_page; 1374 new_se->nr_pages = nr_pages; 1375 new_se->start_block = start_block; 1376 1377 list_add_tail(&new_se->list, &sis->extent_list); 1378 return 1; 1379 } 1380 1381 /* 1382 * A `swap extent' is a simple thing which maps a contiguous range of pages 1383 * onto a contiguous range of disk blocks. An ordered list of swap extents 1384 * is built at swapon time and is then used at swap_writepage/swap_readpage 1385 * time for locating where on disk a page belongs. 1386 * 1387 * If the swapfile is an S_ISBLK block device, a single extent is installed. 1388 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 1389 * swap files identically. 1390 * 1391 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 1392 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 1393 * swapfiles are handled *identically* after swapon time. 1394 * 1395 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 1396 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 1397 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 1398 * requirements, they are simply tossed out - we will never use those blocks 1399 * for swapping. 1400 * 1401 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 1402 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 1403 * which will scribble on the fs. 1404 * 1405 * The amount of disk space which a single swap extent represents varies. 1406 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 1407 * extents in the list. To avoid much list walking, we cache the previous 1408 * search location in `curr_swap_extent', and start new searches from there. 1409 * This is extremely effective. The average number of iterations in 1410 * map_swap_page() has been measured at about 0.3 per page. - akpm. 
1411 */ 1412 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1413 { 1414 struct inode *inode; 1415 unsigned blocks_per_page; 1416 unsigned long page_no; 1417 unsigned blkbits; 1418 sector_t probe_block; 1419 sector_t last_block; 1420 sector_t lowest_block = -1; 1421 sector_t highest_block = 0; 1422 int nr_extents = 0; 1423 int ret; 1424 1425 inode = sis->swap_file->f_mapping->host; 1426 if (S_ISBLK(inode->i_mode)) { 1427 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 *span = sis->pages; 1429 goto done; 1430 } 1431 1432 blkbits = inode->i_blkbits; 1433 blocks_per_page = PAGE_SIZE >> blkbits; 1434 1435 /* 1436 * Map all the blocks into the extent list. This code doesn't try 1437 * to be very smart. 1438 */ 1439 probe_block = 0; 1440 page_no = 0; 1441 last_block = i_size_read(inode) >> blkbits; 1442 while ((probe_block + blocks_per_page) <= last_block && 1443 page_no < sis->max) { 1444 unsigned block_in_page; 1445 sector_t first_block; 1446 1447 first_block = bmap(inode, probe_block); 1448 if (first_block == 0) 1449 goto bad_bmap; 1450 1451 /* 1452 * It must be PAGE_SIZE aligned on-disk 1453 */ 1454 if (first_block & (blocks_per_page - 1)) { 1455 probe_block++; 1456 goto reprobe; 1457 } 1458 1459 for (block_in_page = 1; block_in_page < blocks_per_page; 1460 block_in_page++) { 1461 sector_t block; 1462 1463 block = bmap(inode, probe_block + block_in_page); 1464 if (block == 0) 1465 goto bad_bmap; 1466 if (block != first_block + block_in_page) { 1467 /* Discontiguity */ 1468 probe_block++; 1469 goto reprobe; 1470 } 1471 } 1472 1473 first_block >>= (PAGE_SHIFT - blkbits); 1474 if (page_no) { /* exclude the header page */ 1475 if (first_block < lowest_block) 1476 lowest_block = first_block; 1477 if (first_block > highest_block) 1478 highest_block = first_block; 1479 } 1480 1481 /* 1482 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 1483 */ 1484 ret = add_swap_extent(sis, page_no, 1, first_block); 1485 if (ret < 0) 1486 goto out; 1487 nr_extents += ret; 1488 page_no++; 1489 probe_block += blocks_per_page; 1490 reprobe: 1491 continue; 1492 } 1493 ret = nr_extents; 1494 *span = 1 + highest_block - lowest_block; 1495 if (page_no == 0) 1496 page_no = 1; /* force Empty message */ 1497 sis->max = page_no; 1498 sis->pages = page_no - 1; 1499 sis->highest_bit = page_no - 1; 1500 done: 1501 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1502 struct swap_extent, list); 1503 goto out; 1504 bad_bmap: 1505 printk(KERN_ERR "swapon: swapfile has holes\n"); 1506 ret = -EINVAL; 1507 out: 1508 return ret; 1509 } 1510 1511 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1512 { 1513 struct swap_info_struct * p = NULL; 1514 unsigned short *swap_map; 1515 struct file *swap_file, *victim; 1516 struct address_space *mapping; 1517 struct inode *inode; 1518 char * pathname; 1519 int i, type, prev; 1520 int err; 1521 1522 if (!capable(CAP_SYS_ADMIN)) 1523 return -EPERM; 1524 1525 pathname = getname(specialfile); 1526 err = PTR_ERR(pathname); 1527 if (IS_ERR(pathname)) 1528 goto out; 1529 1530 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1531 putname(pathname); 1532 err = PTR_ERR(victim); 1533 if (IS_ERR(victim)) 1534 goto out; 1535 1536 mapping = victim->f_mapping; 1537 prev = -1; 1538 spin_lock(&swap_lock); 1539 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1540 p = swap_info + type; 1541 if (p->flags & SWP_WRITEOK) { 1542 if (p->swap_file->f_mapping == mapping) 1543 break; 1544 } 1545 prev = type; 1546 } 1547 if (type < 0) 
{
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	if (p->prio < 0) {
		for (i = p->next; i >= 0; i = swap_info[i].next)
			swap_info[i].prio = p->prio--;
		least_priority++;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	current->flags |= PF_OOM_ORIGIN;
	err = try_to_unuse(type);
	current->flags &= ~PF_OOM_ORIGIN;

	if (err) {
		/* re-insert swap space back into swap_list */
		spin_lock(&swap_lock);
		if (p->prio < 0)
			p->prio = --least_priority;
		prev = -1;
		for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
			if (p->prio >= swap_info[i].prio)
				break;
			prev = i;
		}
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = p - swap_info;
		else
			swap_info[prev].next = p - swap_info;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		spin_unlock(&swap_lock);
		goto out_dput;
	}

	/* wait for any unplug function to finish */
	down_write(&swap_unplug_sem);
	up_write(&swap_unplug_sem);

	destroy_swap_extents(p);
	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
	}

	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}

#ifdef CONFIG_PROC_FS
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *ptr = swap_info;
	int i;
	loff_t l = *pos;

	mutex_lock(&swapon_mutex);

	if (!l)
		return SEQ_START_TOKEN;

	for (i = 0; i < nr_swapfiles; i++, ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		if (!--l)
			return ptr;
	}

	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *ptr;
	struct swap_info_struct *endptr = swap_info + nr_swapfiles;

	if (v == SEQ_START_TOKEN)
		ptr = swap_info;
	else {
		ptr = v;
		ptr++;
	}

	for (; ptr < endptr; ptr++) {
		if (!(ptr->flags &
SWP_USED) || !ptr->swap_map) 1690 continue; 1691 ++*pos; 1692 return ptr; 1693 } 1694 1695 return NULL; 1696 } 1697 1698 static void swap_stop(struct seq_file *swap, void *v) 1699 { 1700 mutex_unlock(&swapon_mutex); 1701 } 1702 1703 static int swap_show(struct seq_file *swap, void *v) 1704 { 1705 struct swap_info_struct *ptr = v; 1706 struct file *file; 1707 int len; 1708 1709 if (ptr == SEQ_START_TOKEN) { 1710 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1711 return 0; 1712 } 1713 1714 file = ptr->swap_file; 1715 len = seq_path(swap, &file->f_path, " \t\n\\"); 1716 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1717 len < 40 ? 40 - len : 1, " ", 1718 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1719 "partition" : "file\t", 1720 ptr->pages << (PAGE_SHIFT - 10), 1721 ptr->inuse_pages << (PAGE_SHIFT - 10), 1722 ptr->prio); 1723 return 0; 1724 } 1725 1726 static const struct seq_operations swaps_op = { 1727 .start = swap_start, 1728 .next = swap_next, 1729 .stop = swap_stop, 1730 .show = swap_show 1731 }; 1732 1733 static int swaps_open(struct inode *inode, struct file *file) 1734 { 1735 return seq_open(file, &swaps_op); 1736 } 1737 1738 static const struct file_operations proc_swaps_operations = { 1739 .open = swaps_open, 1740 .read = seq_read, 1741 .llseek = seq_lseek, 1742 .release = seq_release, 1743 }; 1744 1745 static int __init procswaps_init(void) 1746 { 1747 proc_create("swaps", 0, NULL, &proc_swaps_operations); 1748 return 0; 1749 } 1750 __initcall(procswaps_init); 1751 #endif /* CONFIG_PROC_FS */ 1752 1753 #ifdef MAX_SWAPFILES_CHECK 1754 static int __init max_swapfiles_check(void) 1755 { 1756 MAX_SWAPFILES_CHECK(); 1757 return 0; 1758 } 1759 late_initcall(max_swapfiles_check); 1760 #endif 1761 1762 /* 1763 * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
 *
 * The swapon system call
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	char *name = NULL;
	struct block_device *bdev = NULL;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	unsigned int type;
	int i, prev;
	int error;
	union swap_header *swap_header = NULL;
	unsigned int nr_good_pages = 0;
	int nr_extents = 0;
	sector_t span;
	unsigned long maxpages = 1;
	unsigned long swapfilepages;
	unsigned short *swap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	int did_down = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	spin_lock(&swap_lock);
	p = swap_info;
	for (type = 0; type < nr_swapfiles; type++, p++)
		if (!(p->flags & SWP_USED))
			break;
	error = -EPERM;
	if (type >= MAX_SWAPFILES) {
		spin_unlock(&swap_lock);
		goto out;
	}
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	memset(p, 0, sizeof(*p));
	INIT_LIST_HEAD(&p->extent_list);
	p->flags = SWP_USED;
	p->next = -1;
	spin_unlock(&swap_lock);
	name = getname(specialfile);
	error = PTR_ERR(name);
	if (IS_ERR(name)) {
		name = NULL;
		goto bad_swap_2;
	}
	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
	error = PTR_ERR(swap_file);
	if (IS_ERR(swap_file)) {
		swap_file = NULL;
		goto bad_swap_2;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	error = -EBUSY;
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *q = &swap_info[i];

		if (i == type || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_mapping)
			goto bad_swap;
	}

	error = -EINVAL;
	if (S_ISBLK(inode->i_mode)) {
		bdev = I_BDEV(inode);
		error = bd_claim(bdev, sys_swapon);
		if (error < 0) {
			bdev = NULL;
			error = -EINVAL;
			goto bad_swap;
		}
		p->old_block_size = block_size(bdev);
		error = set_blocksize(bdev, PAGE_SIZE);
		if (error < 0)
			goto bad_swap;
		p->bdev = bdev;
	} else if (S_ISREG(inode->i_mode)) {
		p->bdev = inode->i_sb->s_bdev;
		mutex_lock(&inode->i_mutex);
		did_down = 1;
		if (IS_SWAPFILE(inode)) {
			error = -EBUSY;
			goto bad_swap;
		}
	} else {
		goto bad_swap;
	}

	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	swap_header = kmap(page);

	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
		printk(KERN_ERR "Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}
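	/*
	 * The header occupies the first PAGE_SIZE bytes of the swap area,
	 * as laid out by union swap_header in <linux/swap.h>: the
	 * "SWAPSPACE2" magic sits in the last 10 bytes of the page, while
	 * the info fields used below (version, last_page, nr_badpages and
	 * the badpages[] array) live after a reserved boot-block area at
	 * the start.  This header page is never handed out as swap space;
	 * swap_map[0] is marked SWAP_MAP_BAD further down.
	 */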
	/* swap partition endianness hack... */
	if (swab32(swap_header->info.version) == 1) {
		swab32s(&swap_header->info.version);
		swab32s(&swap_header->info.last_page);
		swab32s(&swap_header->info.nr_badpages);
		for (i = 0; i < swap_header->info.nr_badpages; i++)
			swab32s(&swap_header->info.badpages[i]);
	}
	/* Check the swap header's sub-version */
	if (swap_header->info.version != 1) {
		printk(KERN_WARNING
		       "Unable to handle swap header version %d\n",
		       swap_header->info.version);
		error = -EINVAL;
		goto bad_swap;
	}

	p->lowest_bit = 1;
	p->cluster_next = 1;

	/*
	 * Find out how many pages are allowed for a single swap
	 * device. There are two limiting factors: 1) the number of
	 * bits for the swap offset in the swp_entry_t type and
	 * 2) the number of bits in a swap pte as defined by
	 * the different architectures. In order to find the
	 * largest possible bit mask a swap entry with swap type 0
	 * and swap offset ~0UL is created, encoded to a swap pte,
	 * decoded to a swp_entry_t again and finally the swap
	 * offset is extracted. This will mask all the bits from
	 * the initial ~0UL mask that can't be encoded in either
	 * the swp_entry_t or the architecture definition of a
	 * swap pte.
	 */
	maxpages = swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
	if (maxpages > swap_header->info.last_page)
		maxpages = swap_header->info.last_page;
	p->highest_bit = maxpages - 1;

	error = -EINVAL;
	if (!maxpages)
		goto bad_swap;
	if (swapfilepages && maxpages > swapfilepages) {
		printk(KERN_WARNING
		       "Swap area shorter than signature indicates\n");
		goto bad_swap;
	}
	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
		goto bad_swap;
	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
		goto bad_swap;

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vmalloc(maxpages * sizeof(short));
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}

	memset(swap_map, 0, maxpages * sizeof(short));
	for (i = 0; i < swap_header->info.nr_badpages; i++) {
		int page_nr = swap_header->info.badpages[i];
		if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
			error = -EINVAL;
			goto bad_swap;
		}
		swap_map[page_nr] = SWAP_MAP_BAD;
	}

	error = swap_cgroup_swapon(type, maxpages);
	if (error)
		goto bad_swap;

	nr_good_pages = swap_header->info.last_page -
			swap_header->info.nr_badpages -
			1 /* header page */;

	if (nr_good_pages) {
		swap_map[0] = SWAP_MAP_BAD;
		p->max = maxpages;
		p->pages = nr_good_pages;
		nr_extents = setup_swap_extents(p, &span);
		if (nr_extents < 0) {
			error = nr_extents;
			goto bad_swap;
		}
		nr_good_pages = p->pages;
	}
	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}

	if (p->bdev) {
		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
			p->flags |= SWP_SOLIDSTATE;
			p->cluster_next = 1 + (random32() % p->highest_bit);
		}
		if (discard_swap(p) == 0)
			p->flags |= SWP_DISCARDABLE;
	}
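	/*
	 * Pick this area's priority before publishing it below: with
	 * SWAP_FLAG_PREFER the caller supplies the priority in the low bits
	 * of swap_flags, otherwise each new area gets the next more negative
	 * value of least_priority, so areas swapped on later default to a
	 * lower priority than earlier ones.  get_swap_page() above only
	 * round-robins swap_list.next among areas that share a priority.
	 */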

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	if (swap_flags & SWAP_FLAG_PREFER)
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	else
		p->prio = --least_priority;
	p->swap_map = swap_map;
	p->flags |= SWP_WRITEOK;
	nr_swap_pages += nr_good_pages;
	total_swap_pages += nr_good_pages;

	printk(KERN_INFO "Adding %uk swap on %s.  "
			"Priority:%d extents:%d across:%lluk %s%s\n",
		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "");

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio)
			break;
		prev = i;
	}
	p->next = i;
	if (prev < 0)
		swap_list.head = swap_list.next = p - swap_info;
	else
		swap_info[prev].next = p - swap_info;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	error = 0;
	goto out;
bad_swap:
	if (bdev) {
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(type);
bad_swap_2:
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	if (did_down) {
		if (!error)
			inode->i_flags |= S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	return error;
}

void si_swapinfo(struct sysinfo *val)
{
	unsigned int i;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (i = 0; i < nr_swapfiles; i++) {
		if (!(swap_info[i].flags & SWP_USED) ||
		     (swap_info[i].flags & SWP_WRITEOK))
			continue;
		nr_to_be_unused += swap_info[i].inuse_pages;
	}
	val->freeswap = nr_swap_pages + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}
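/*
 * nr_to_be_unused above covers areas that are in the middle of a swapoff
 * (SWP_USED still set, SWP_WRITEOK already cleared): their pages have
 * already been subtracted from nr_swap_pages and total_swap_pages, but the
 * slots still in use there are being emptied by try_to_unuse(), so they are
 * reported as both free and total swap until the swapoff completes.
 */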

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 * "permanent", but will be reclaimed by the next swapoff.
 * Returns an error code in the following cases:
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is a migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 */
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
	struct swap_info_struct *p;
	unsigned long offset, type;
	int result = -EINVAL;
	int count;
	bool has_cache;

	if (non_swap_entry(entry))
		return -EINVAL;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = type + swap_info;
	offset = swp_offset(entry);

	spin_lock(&swap_lock);

	if (unlikely(offset >= p->max))
		goto unlock_out;

	count = swap_count(p->swap_map[offset]);
	has_cache = swap_has_cache(p->swap_map[offset]);

	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count) {
			p->swap_map[offset] = encode_swapmap(count, true);
			result = 0;
		} else if (has_cache) /* someone added cache */
			result = -EEXIST;
		else if (!count) /* no users */
			result = -ENOENT;

	} else if (count || has_cache) {
		if (count < SWAP_MAP_MAX - 1) {
			p->swap_map[offset] = encode_swapmap(count + 1,
							     has_cache);
			result = 0;
		} else if (count <= SWAP_MAP_MAX) {
			if (swap_overflow++ < 5)
				printk(KERN_WARNING
				       "swap_dup: swap entry overflow\n");
			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
							     has_cache);
			result = 0;
		}
	} else
		result = -ENOENT; /* unused swap entry */
unlock_out:
	spin_unlock(&swap_lock);
out:
	return result;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}

/*
 * increase reference count of swap entry by 1.
 */
void swap_duplicate(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP);
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes. Returns 0 at success.
 * -EEXIST means there is already a swap cache for the entry.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_CACHE);
}


struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
	return &swap_info[type];
}
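/*
 * Rough lifetime of a swap entry's references (for orientation, not an
 * exhaustive list of callers): swap_duplicate() is typically called when
 * another pte comes to reference the entry, e.g. when fork copies a
 * swapped-out pte, and swap_free() drops that reference when the pte is
 * zapped or the page is swapped back in; swapcache_prepare() and
 * swapcache_free() bracket the swap cache's own SWAP_HAS_CACHE reference.
 */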

/*
 * swap_lock prevents swap_map being freed. Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	struct swap_info_struct *si;
	int our_page_cluster = page_cluster;
	pgoff_t target, toff;
	pgoff_t base, end;
	int nr_pages = 0;

	if (!our_page_cluster)	/* no readahead */
		return 0;

	si = &swap_info[swp_type(entry)];
	target = swp_offset(entry);
	base = (target >> our_page_cluster) << our_page_cluster;
	end = base + (1 << our_page_cluster);
	if (!base)		/* first page is swap header */
		base++;

	spin_lock(&swap_lock);
	if (end > si->max)	/* don't go beyond end of map */
		end = si->max;

	/* Count contiguous allocated slots above our target */
	for (toff = target; ++toff < end; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	/* Count contiguous allocated slots below our target */
	for (toff = target; --toff >= base; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	spin_unlock(&swap_lock);

	/*
	 * Indicate starting offset, and return number of pages to get:
	 * if only 1, say 0, since there's then no readahead to be done.
	 */
	*offset = ++toff;
	return nr_pages ? ++nr_pages : 0;
}