/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
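 * (In practice this means swap_avail_lock nests inside swap_info_struct->lock;
 * see scan_swap_map_slots() and swap_entry_free() below.)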
 */
static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}

/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c when reclaiming pages. So we hold a lock on the page here.
	 * We have to use trylock to avoid deadlock. This is a special
	 * case and you should use try_to_free_swap() with explicit lock_page()
	 * in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = &si->first_swap_extent;
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	list_for_each_entry(se, &si->first_swap_extent.list, list) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tells the device that a cluster of swap entries can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
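 * (Block numbers are converted from page units to 512-byte sectors, i.e.
 * shifted by PAGE_SHIFT - 9, before being passed to blkdev_issue_discard().)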
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
				    nr_blocks, GFP_NOIO, 0))
				break;
		}

		se = list_next_entry(se, list);
	}
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256

static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

static inline struct swap_cluster_info *lock_cluster_or_swap_info(
	struct swap_info_struct *si,
	unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
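	/* An empty list is represented by a null head and tail (CLUSTER_FLAG_NEXT_NULL). */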
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired while holding swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		unlock_cluster(ci_tail);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied).
	 * They will be cleared after the discard.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

/*
 * Do the discards. After a cluster discard is finished, the cluster will be
 * added to the free cluster list. The caller should hold si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		cluster_set_flag(ci, CLUSTER_FLAG_FREE);
		unlock_cluster(ci);
		cluster_list_add_tail(&si->free_clusters, info, idx);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from the free cluster list and its usage counter will be increased.
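 * (The counter tracks how many swap entries in the cluster are currently
 * allocated, up to SWAPFILE_CLUSTER.)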
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx])) {
		VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
		cluster_list_del_first(&p->free_clusters, cluster_info);
		cluster_set_count_flag(&cluster_info[idx], 0, 0);
	}

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to the free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);

	if (cluster_count(&cluster_info[idx]) == 0) {
		/*
		 * If the swap is discardable, schedule a discard of the
		 * cluster instead of freeing it immediately. The cluster
		 * will be freed after the discard.
		 */
		if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
				 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
			swap_cluster_schedule_discard(p, idx);
			return;
		}

		cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
		cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
	}
}

/*
 * It's possible that scan_swap_map() uses a free cluster in the middle of the
 * free cluster list. Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
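 * Returns true and sets *offset and *scan_base to a free slot on success;
 * returns false if no usable cluster or free slot could be found.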
501 */ 502 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, 503 unsigned long *offset, unsigned long *scan_base) 504 { 505 struct percpu_cluster *cluster; 506 struct swap_cluster_info *ci; 507 bool found_free; 508 unsigned long tmp, max; 509 510 new_cluster: 511 cluster = this_cpu_ptr(si->percpu_cluster); 512 if (cluster_is_null(&cluster->index)) { 513 if (!cluster_list_empty(&si->free_clusters)) { 514 cluster->index = si->free_clusters.head; 515 cluster->next = cluster_next(&cluster->index) * 516 SWAPFILE_CLUSTER; 517 } else if (!cluster_list_empty(&si->discard_clusters)) { 518 /* 519 * we don't have free cluster but have some clusters in 520 * discarding, do discard now and reclaim them 521 */ 522 swap_do_scheduled_discard(si); 523 *scan_base = *offset = si->cluster_next; 524 goto new_cluster; 525 } else 526 return false; 527 } 528 529 found_free = false; 530 531 /* 532 * Other CPUs can use our cluster if they can't find a free cluster, 533 * check if there is still free entry in the cluster 534 */ 535 tmp = cluster->next; 536 max = min_t(unsigned long, si->max, 537 (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); 538 if (tmp >= max) { 539 cluster_set_null(&cluster->index); 540 goto new_cluster; 541 } 542 ci = lock_cluster(si, tmp); 543 while (tmp < max) { 544 if (!si->swap_map[tmp]) { 545 found_free = true; 546 break; 547 } 548 tmp++; 549 } 550 unlock_cluster(ci); 551 if (!found_free) { 552 cluster_set_null(&cluster->index); 553 goto new_cluster; 554 } 555 cluster->next = tmp + 1; 556 *offset = tmp; 557 *scan_base = tmp; 558 return found_free; 559 } 560 561 static int scan_swap_map_slots(struct swap_info_struct *si, 562 unsigned char usage, int nr, 563 swp_entry_t slots[]) 564 { 565 struct swap_cluster_info *ci; 566 unsigned long offset; 567 unsigned long scan_base; 568 unsigned long last_in_cluster = 0; 569 int latency_ration = LATENCY_LIMIT; 570 int n_ret = 0; 571 572 if (nr > SWAP_BATCH) 573 nr = SWAP_BATCH; 574 575 /* 576 * We try to cluster swap pages by allocating them sequentially 577 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 578 * way, however, we resort to first-free allocation, starting 579 * a new cluster. This prevents us from scattering swap pages 580 * all over the entire swap partition, so that we reduce 581 * overall disk seek times between swap pages. -- sct 582 * But we do now try to find an empty cluster. -Andrea 583 * And we let swap pages go all over an SSD partition. Hugh 584 */ 585 586 si->flags += SWP_SCANNING; 587 scan_base = offset = si->cluster_next; 588 589 /* SSD algorithm */ 590 if (si->cluster_info) { 591 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) 592 goto checks; 593 else 594 goto scan; 595 } 596 597 if (unlikely(!si->cluster_nr--)) { 598 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 599 si->cluster_nr = SWAPFILE_CLUSTER - 1; 600 goto checks; 601 } 602 603 spin_unlock(&si->lock); 604 605 /* 606 * If seek is expensive, start searching for new cluster from 607 * start of partition, to minimize the span of allocated swap. 608 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info 609 * case, just handled by scan_swap_map_try_ssd_cluster() above. 
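 * (In other words, the empty-cluster scan below only runs for rotational
 * devices that have no cluster_info.)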
610 */ 611 scan_base = offset = si->lowest_bit; 612 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 613 614 /* Locate the first empty (unaligned) cluster */ 615 for (; last_in_cluster <= si->highest_bit; offset++) { 616 if (si->swap_map[offset]) 617 last_in_cluster = offset + SWAPFILE_CLUSTER; 618 else if (offset == last_in_cluster) { 619 spin_lock(&si->lock); 620 offset -= SWAPFILE_CLUSTER - 1; 621 si->cluster_next = offset; 622 si->cluster_nr = SWAPFILE_CLUSTER - 1; 623 goto checks; 624 } 625 if (unlikely(--latency_ration < 0)) { 626 cond_resched(); 627 latency_ration = LATENCY_LIMIT; 628 } 629 } 630 631 offset = scan_base; 632 spin_lock(&si->lock); 633 si->cluster_nr = SWAPFILE_CLUSTER - 1; 634 } 635 636 checks: 637 if (si->cluster_info) { 638 while (scan_swap_map_ssd_cluster_conflict(si, offset)) { 639 /* take a break if we already got some slots */ 640 if (n_ret) 641 goto done; 642 if (!scan_swap_map_try_ssd_cluster(si, &offset, 643 &scan_base)) 644 goto scan; 645 } 646 } 647 if (!(si->flags & SWP_WRITEOK)) 648 goto no_page; 649 if (!si->highest_bit) 650 goto no_page; 651 if (offset > si->highest_bit) 652 scan_base = offset = si->lowest_bit; 653 654 ci = lock_cluster(si, offset); 655 /* reuse swap entry of cache-only swap if not busy. */ 656 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 657 int swap_was_freed; 658 unlock_cluster(ci); 659 spin_unlock(&si->lock); 660 swap_was_freed = __try_to_reclaim_swap(si, offset); 661 spin_lock(&si->lock); 662 /* entry was freed successfully, try to use this again */ 663 if (swap_was_freed) 664 goto checks; 665 goto scan; /* check next one */ 666 } 667 668 if (si->swap_map[offset]) { 669 unlock_cluster(ci); 670 if (!n_ret) 671 goto scan; 672 else 673 goto done; 674 } 675 676 if (offset == si->lowest_bit) 677 si->lowest_bit++; 678 if (offset == si->highest_bit) 679 si->highest_bit--; 680 si->inuse_pages++; 681 if (si->inuse_pages == si->pages) { 682 si->lowest_bit = si->max; 683 si->highest_bit = 0; 684 spin_lock(&swap_avail_lock); 685 plist_del(&si->avail_list, &swap_avail_head); 686 spin_unlock(&swap_avail_lock); 687 } 688 si->swap_map[offset] = usage; 689 inc_cluster_info_page(si, si->cluster_info, offset); 690 unlock_cluster(ci); 691 si->cluster_next = offset + 1; 692 slots[n_ret++] = swp_entry(si->type, offset); 693 694 /* got enough slots or reach max slots? */ 695 if ((n_ret == nr) || (offset >= si->highest_bit)) 696 goto done; 697 698 /* search for next available slot */ 699 700 /* time to take a break? */ 701 if (unlikely(--latency_ration < 0)) { 702 if (n_ret) 703 goto done; 704 spin_unlock(&si->lock); 705 cond_resched(); 706 spin_lock(&si->lock); 707 latency_ration = LATENCY_LIMIT; 708 } 709 710 /* try to get more slots in cluster */ 711 if (si->cluster_info) { 712 if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) 713 goto checks; 714 else 715 goto done; 716 } 717 /* non-ssd case */ 718 ++offset; 719 720 /* non-ssd case, still more slots in cluster? 
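 * (si->cluster_nr counts the slots remaining in the current allocation cluster.)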
*/ 721 if (si->cluster_nr && !si->swap_map[offset]) { 722 --si->cluster_nr; 723 goto checks; 724 } 725 726 done: 727 si->flags -= SWP_SCANNING; 728 return n_ret; 729 730 scan: 731 spin_unlock(&si->lock); 732 while (++offset <= si->highest_bit) { 733 if (!si->swap_map[offset]) { 734 spin_lock(&si->lock); 735 goto checks; 736 } 737 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 738 spin_lock(&si->lock); 739 goto checks; 740 } 741 if (unlikely(--latency_ration < 0)) { 742 cond_resched(); 743 latency_ration = LATENCY_LIMIT; 744 } 745 } 746 offset = si->lowest_bit; 747 while (offset < scan_base) { 748 if (!si->swap_map[offset]) { 749 spin_lock(&si->lock); 750 goto checks; 751 } 752 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 753 spin_lock(&si->lock); 754 goto checks; 755 } 756 if (unlikely(--latency_ration < 0)) { 757 cond_resched(); 758 latency_ration = LATENCY_LIMIT; 759 } 760 offset++; 761 } 762 spin_lock(&si->lock); 763 764 no_page: 765 si->flags -= SWP_SCANNING; 766 return n_ret; 767 } 768 769 static unsigned long scan_swap_map(struct swap_info_struct *si, 770 unsigned char usage) 771 { 772 swp_entry_t entry; 773 int n_ret; 774 775 n_ret = scan_swap_map_slots(si, usage, 1, &entry); 776 777 if (n_ret) 778 return swp_offset(entry); 779 else 780 return 0; 781 782 } 783 784 int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) 785 { 786 struct swap_info_struct *si, *next; 787 long avail_pgs; 788 int n_ret = 0; 789 790 avail_pgs = atomic_long_read(&nr_swap_pages); 791 if (avail_pgs <= 0) 792 goto noswap; 793 794 if (n_goal > SWAP_BATCH) 795 n_goal = SWAP_BATCH; 796 797 if (n_goal > avail_pgs) 798 n_goal = avail_pgs; 799 800 atomic_long_sub(n_goal, &nr_swap_pages); 801 802 spin_lock(&swap_avail_lock); 803 804 start_over: 805 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { 806 /* requeue si to after same-priority siblings */ 807 plist_requeue(&si->avail_list, &swap_avail_head); 808 spin_unlock(&swap_avail_lock); 809 spin_lock(&si->lock); 810 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 811 spin_lock(&swap_avail_lock); 812 if (plist_node_empty(&si->avail_list)) { 813 spin_unlock(&si->lock); 814 goto nextsi; 815 } 816 WARN(!si->highest_bit, 817 "swap_info %d in list but !highest_bit\n", 818 si->type); 819 WARN(!(si->flags & SWP_WRITEOK), 820 "swap_info %d in list but !SWP_WRITEOK\n", 821 si->type); 822 plist_del(&si->avail_list, &swap_avail_head); 823 spin_unlock(&si->lock); 824 goto nextsi; 825 } 826 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 827 n_goal, swp_entries); 828 spin_unlock(&si->lock); 829 if (n_ret) 830 goto check_out; 831 pr_debug("scan_swap_map of si %d failed to find offset\n", 832 si->type); 833 834 spin_lock(&swap_avail_lock); 835 nextsi: 836 /* 837 * if we got here, it's likely that si was almost full before, 838 * and since scan_swap_map() can drop the si->lock, multiple 839 * callers probably all tried to get a page from the same si 840 * and it filled up before we could get one; or, the si filled 841 * up between us dropping swap_avail_lock and taking si->lock. 842 * Since we dropped the swap_avail_lock, the swap_avail_head 843 * list may have been modified; so if next is still in the 844 * swap_avail_head list then try it, otherwise start over 845 * if we have not gotten any slots. 
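 * (A successful scan jumps straight to check_out, so reaching this point
 * always means n_ret is still zero.)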
846 */ 847 if (plist_node_empty(&next->avail_list)) 848 goto start_over; 849 } 850 851 spin_unlock(&swap_avail_lock); 852 853 check_out: 854 if (n_ret < n_goal) 855 atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); 856 noswap: 857 return n_ret; 858 } 859 860 /* The only caller of this function is now suspend routine */ 861 swp_entry_t get_swap_page_of_type(int type) 862 { 863 struct swap_info_struct *si; 864 pgoff_t offset; 865 866 si = swap_info[type]; 867 spin_lock(&si->lock); 868 if (si && (si->flags & SWP_WRITEOK)) { 869 atomic_long_dec(&nr_swap_pages); 870 /* This is called for allocating swap entry, not cache */ 871 offset = scan_swap_map(si, 1); 872 if (offset) { 873 spin_unlock(&si->lock); 874 return swp_entry(type, offset); 875 } 876 atomic_long_inc(&nr_swap_pages); 877 } 878 spin_unlock(&si->lock); 879 return (swp_entry_t) {0}; 880 } 881 882 static struct swap_info_struct *__swap_info_get(swp_entry_t entry) 883 { 884 struct swap_info_struct *p; 885 unsigned long offset, type; 886 887 if (!entry.val) 888 goto out; 889 type = swp_type(entry); 890 if (type >= nr_swapfiles) 891 goto bad_nofile; 892 p = swap_info[type]; 893 if (!(p->flags & SWP_USED)) 894 goto bad_device; 895 offset = swp_offset(entry); 896 if (offset >= p->max) 897 goto bad_offset; 898 return p; 899 900 bad_offset: 901 pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); 902 goto out; 903 bad_device: 904 pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); 905 goto out; 906 bad_nofile: 907 pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); 908 out: 909 return NULL; 910 } 911 912 static struct swap_info_struct *_swap_info_get(swp_entry_t entry) 913 { 914 struct swap_info_struct *p; 915 916 p = __swap_info_get(entry); 917 if (!p) 918 goto out; 919 if (!p->swap_map[swp_offset(entry)]) 920 goto bad_free; 921 return p; 922 923 bad_free: 924 pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); 925 goto out; 926 out: 927 return NULL; 928 } 929 930 static struct swap_info_struct *swap_info_get(swp_entry_t entry) 931 { 932 struct swap_info_struct *p; 933 934 p = _swap_info_get(entry); 935 if (p) 936 spin_lock(&p->lock); 937 return p; 938 } 939 940 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, 941 struct swap_info_struct *q) 942 { 943 struct swap_info_struct *p; 944 945 p = _swap_info_get(entry); 946 947 if (p != q) { 948 if (q != NULL) 949 spin_unlock(&q->lock); 950 if (p != NULL) 951 spin_lock(&p->lock); 952 } 953 return p; 954 } 955 956 static unsigned char __swap_entry_free(struct swap_info_struct *p, 957 swp_entry_t entry, unsigned char usage) 958 { 959 struct swap_cluster_info *ci; 960 unsigned long offset = swp_offset(entry); 961 unsigned char count; 962 unsigned char has_cache; 963 964 ci = lock_cluster_or_swap_info(p, offset); 965 966 count = p->swap_map[offset]; 967 968 has_cache = count & SWAP_HAS_CACHE; 969 count &= ~SWAP_HAS_CACHE; 970 971 if (usage == SWAP_HAS_CACHE) { 972 VM_BUG_ON(!has_cache); 973 has_cache = 0; 974 } else if (count == SWAP_MAP_SHMEM) { 975 /* 976 * Or we could insist on shmem.c using a special 977 * swap_shmem_free() and free_shmem_swap_and_cache()... 978 */ 979 count = 0; 980 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 981 if (count == COUNT_CONTINUED) { 982 if (swap_count_continued(p, offset, count)) 983 count = SWAP_MAP_MAX | COUNT_CONTINUED; 984 else 985 count = SWAP_MAP_MAX; 986 } else 987 count--; 988 } 989 990 usage = count | has_cache; 991 p->swap_map[offset] = usage ? 
: SWAP_HAS_CACHE; 992 993 unlock_cluster_or_swap_info(p, ci); 994 995 return usage; 996 } 997 998 static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) 999 { 1000 struct swap_cluster_info *ci; 1001 unsigned long offset = swp_offset(entry); 1002 unsigned char count; 1003 1004 ci = lock_cluster(p, offset); 1005 count = p->swap_map[offset]; 1006 VM_BUG_ON(count != SWAP_HAS_CACHE); 1007 p->swap_map[offset] = 0; 1008 dec_cluster_info_page(p, p->cluster_info, offset); 1009 unlock_cluster(ci); 1010 1011 mem_cgroup_uncharge_swap(entry); 1012 if (offset < p->lowest_bit) 1013 p->lowest_bit = offset; 1014 if (offset > p->highest_bit) { 1015 bool was_full = !p->highest_bit; 1016 1017 p->highest_bit = offset; 1018 if (was_full && (p->flags & SWP_WRITEOK)) { 1019 spin_lock(&swap_avail_lock); 1020 WARN_ON(!plist_node_empty(&p->avail_list)); 1021 if (plist_node_empty(&p->avail_list)) 1022 plist_add(&p->avail_list, 1023 &swap_avail_head); 1024 spin_unlock(&swap_avail_lock); 1025 } 1026 } 1027 atomic_long_inc(&nr_swap_pages); 1028 p->inuse_pages--; 1029 frontswap_invalidate_page(p->type, offset); 1030 if (p->flags & SWP_BLKDEV) { 1031 struct gendisk *disk = p->bdev->bd_disk; 1032 1033 if (disk->fops->swap_slot_free_notify) 1034 disk->fops->swap_slot_free_notify(p->bdev, 1035 offset); 1036 } 1037 } 1038 1039 /* 1040 * Caller has made sure that the swap device corresponding to entry 1041 * is still around or has not been recycled. 1042 */ 1043 void swap_free(swp_entry_t entry) 1044 { 1045 struct swap_info_struct *p; 1046 1047 p = _swap_info_get(entry); 1048 if (p) { 1049 if (!__swap_entry_free(p, entry, 1)) 1050 free_swap_slot(entry); 1051 } 1052 } 1053 1054 /* 1055 * Called after dropping swapcache to decrease refcnt to swap entries. 1056 */ 1057 void swapcache_free(swp_entry_t entry) 1058 { 1059 struct swap_info_struct *p; 1060 1061 p = _swap_info_get(entry); 1062 if (p) { 1063 if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) 1064 free_swap_slot(entry); 1065 } 1066 } 1067 1068 void swapcache_free_entries(swp_entry_t *entries, int n) 1069 { 1070 struct swap_info_struct *p, *prev; 1071 int i; 1072 1073 if (n <= 0) 1074 return; 1075 1076 prev = NULL; 1077 p = NULL; 1078 for (i = 0; i < n; ++i) { 1079 p = swap_info_get_cont(entries[i], prev); 1080 if (p) 1081 swap_entry_free(p, entries[i]); 1082 else 1083 break; 1084 prev = p; 1085 } 1086 if (p) 1087 spin_unlock(&p->lock); 1088 } 1089 1090 /* 1091 * How many references to page are currently swapped out? 1092 * This does not give an exact answer when swap count is continued, 1093 * but does include the high COUNT_CONTINUED flag to allow for that. 1094 */ 1095 int page_swapcount(struct page *page) 1096 { 1097 int count = 0; 1098 struct swap_info_struct *p; 1099 struct swap_cluster_info *ci; 1100 swp_entry_t entry; 1101 unsigned long offset; 1102 1103 entry.val = page_private(page); 1104 p = _swap_info_get(entry); 1105 if (p) { 1106 offset = swp_offset(entry); 1107 ci = lock_cluster_or_swap_info(p, offset); 1108 count = swap_count(p->swap_map[offset]); 1109 unlock_cluster_or_swap_info(p, ci); 1110 } 1111 return count; 1112 } 1113 1114 /* 1115 * How many references to @entry are currently swapped out? 1116 * This does not give an exact answer when swap count is continued, 1117 * but does include the high COUNT_CONTINUED flag to allow for that. 
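 * (Unlike page_swapcount(), this takes a swp_entry_t directly instead of
 * deriving it from a swap cache page.)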
1118 */ 1119 int __swp_swapcount(swp_entry_t entry) 1120 { 1121 int count = 0; 1122 pgoff_t offset; 1123 struct swap_info_struct *si; 1124 struct swap_cluster_info *ci; 1125 1126 si = __swap_info_get(entry); 1127 if (si) { 1128 offset = swp_offset(entry); 1129 ci = lock_cluster_or_swap_info(si, offset); 1130 count = swap_count(si->swap_map[offset]); 1131 unlock_cluster_or_swap_info(si, ci); 1132 } 1133 return count; 1134 } 1135 1136 /* 1137 * How many references to @entry are currently swapped out? 1138 * This considers COUNT_CONTINUED so it returns exact answer. 1139 */ 1140 int swp_swapcount(swp_entry_t entry) 1141 { 1142 int count, tmp_count, n; 1143 struct swap_info_struct *p; 1144 struct swap_cluster_info *ci; 1145 struct page *page; 1146 pgoff_t offset; 1147 unsigned char *map; 1148 1149 p = _swap_info_get(entry); 1150 if (!p) 1151 return 0; 1152 1153 offset = swp_offset(entry); 1154 1155 ci = lock_cluster_or_swap_info(p, offset); 1156 1157 count = swap_count(p->swap_map[offset]); 1158 if (!(count & COUNT_CONTINUED)) 1159 goto out; 1160 1161 count &= ~COUNT_CONTINUED; 1162 n = SWAP_MAP_MAX + 1; 1163 1164 page = vmalloc_to_page(p->swap_map + offset); 1165 offset &= ~PAGE_MASK; 1166 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1167 1168 do { 1169 page = list_next_entry(page, lru); 1170 map = kmap_atomic(page); 1171 tmp_count = map[offset]; 1172 kunmap_atomic(map); 1173 1174 count += (tmp_count & ~COUNT_CONTINUED) * n; 1175 n *= (SWAP_CONT_MAX + 1); 1176 } while (tmp_count & COUNT_CONTINUED); 1177 out: 1178 unlock_cluster_or_swap_info(p, ci); 1179 return count; 1180 } 1181 1182 /* 1183 * We can write to an anon page without COW if there are no other references 1184 * to it. And as a side-effect, free up its swap: because the old content 1185 * on disk will never be read, and seeking back there to write new content 1186 * later would only waste time away from clustering. 1187 * 1188 * NOTE: total_mapcount should not be relied upon by the caller if 1189 * reuse_swap_page() returns false, but it may be always overwritten 1190 * (see the other implementation for CONFIG_SWAP=n). 1191 */ 1192 bool reuse_swap_page(struct page *page, int *total_mapcount) 1193 { 1194 int count; 1195 1196 VM_BUG_ON_PAGE(!PageLocked(page), page); 1197 if (unlikely(PageKsm(page))) 1198 return false; 1199 count = page_trans_huge_mapcount(page, total_mapcount); 1200 if (count <= 1 && PageSwapCache(page)) { 1201 count += page_swapcount(page); 1202 if (count != 1) 1203 goto out; 1204 if (!PageWriteback(page)) { 1205 delete_from_swap_cache(page); 1206 SetPageDirty(page); 1207 } else { 1208 swp_entry_t entry; 1209 struct swap_info_struct *p; 1210 1211 entry.val = page_private(page); 1212 p = swap_info_get(entry); 1213 if (p->flags & SWP_STABLE_WRITES) { 1214 spin_unlock(&p->lock); 1215 return false; 1216 } 1217 spin_unlock(&p->lock); 1218 } 1219 } 1220 out: 1221 return count <= 1; 1222 } 1223 1224 /* 1225 * If swap is getting full, or if there are no more mappings of this page, 1226 * then try_to_free_swap is called to free its swap space. 
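 * Returns 1 if the swap was freed, 0 otherwise. The page must be locked.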
1227 */ 1228 int try_to_free_swap(struct page *page) 1229 { 1230 VM_BUG_ON_PAGE(!PageLocked(page), page); 1231 1232 if (!PageSwapCache(page)) 1233 return 0; 1234 if (PageWriteback(page)) 1235 return 0; 1236 if (page_swapcount(page)) 1237 return 0; 1238 1239 /* 1240 * Once hibernation has begun to create its image of memory, 1241 * there's a danger that one of the calls to try_to_free_swap() 1242 * - most probably a call from __try_to_reclaim_swap() while 1243 * hibernation is allocating its own swap pages for the image, 1244 * but conceivably even a call from memory reclaim - will free 1245 * the swap from a page which has already been recorded in the 1246 * image as a clean swapcache page, and then reuse its swap for 1247 * another page of the image. On waking from hibernation, the 1248 * original page might be freed under memory pressure, then 1249 * later read back in from swap, now with the wrong data. 1250 * 1251 * Hibernation suspends storage while it is writing the image 1252 * to disk so check that here. 1253 */ 1254 if (pm_suspended_storage()) 1255 return 0; 1256 1257 delete_from_swap_cache(page); 1258 SetPageDirty(page); 1259 return 1; 1260 } 1261 1262 /* 1263 * Free the swap entry like above, but also try to 1264 * free the page cache entry if it is the last user. 1265 */ 1266 int free_swap_and_cache(swp_entry_t entry) 1267 { 1268 struct swap_info_struct *p; 1269 struct page *page = NULL; 1270 unsigned char count; 1271 1272 if (non_swap_entry(entry)) 1273 return 1; 1274 1275 p = _swap_info_get(entry); 1276 if (p) { 1277 count = __swap_entry_free(p, entry, 1); 1278 if (count == SWAP_HAS_CACHE) { 1279 page = find_get_page(swap_address_space(entry), 1280 swp_offset(entry)); 1281 if (page && !trylock_page(page)) { 1282 put_page(page); 1283 page = NULL; 1284 } 1285 } else if (!count) 1286 free_swap_slot(entry); 1287 } 1288 if (page) { 1289 /* 1290 * Not mapped elsewhere, or swap space full? Free it! 1291 * Also recheck PageSwapCache now page is locked (above). 1292 */ 1293 if (PageSwapCache(page) && !PageWriteback(page) && 1294 (!page_mapped(page) || mem_cgroup_swap_full(page))) { 1295 delete_from_swap_cache(page); 1296 SetPageDirty(page); 1297 } 1298 unlock_page(page); 1299 put_page(page); 1300 } 1301 return p != NULL; 1302 } 1303 1304 #ifdef CONFIG_HIBERNATION 1305 /* 1306 * Find the swap type that corresponds to given device (if any). 1307 * 1308 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1309 * from 0, in which the swap header is expected to be located. 1310 * 1311 * This is needed for the suspend to disk (aka swsusp). 
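 * Returns the swap type on success, or -ENODEV if no matching swap device is
 * found. On success, *bdev_p (if supplied) gets a reference to the block device.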
1312 */ 1313 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 1314 { 1315 struct block_device *bdev = NULL; 1316 int type; 1317 1318 if (device) 1319 bdev = bdget(device); 1320 1321 spin_lock(&swap_lock); 1322 for (type = 0; type < nr_swapfiles; type++) { 1323 struct swap_info_struct *sis = swap_info[type]; 1324 1325 if (!(sis->flags & SWP_WRITEOK)) 1326 continue; 1327 1328 if (!bdev) { 1329 if (bdev_p) 1330 *bdev_p = bdgrab(sis->bdev); 1331 1332 spin_unlock(&swap_lock); 1333 return type; 1334 } 1335 if (bdev == sis->bdev) { 1336 struct swap_extent *se = &sis->first_swap_extent; 1337 1338 if (se->start_block == offset) { 1339 if (bdev_p) 1340 *bdev_p = bdgrab(sis->bdev); 1341 1342 spin_unlock(&swap_lock); 1343 bdput(bdev); 1344 return type; 1345 } 1346 } 1347 } 1348 spin_unlock(&swap_lock); 1349 if (bdev) 1350 bdput(bdev); 1351 1352 return -ENODEV; 1353 } 1354 1355 /* 1356 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1357 * corresponding to given index in swap_info (swap type). 1358 */ 1359 sector_t swapdev_block(int type, pgoff_t offset) 1360 { 1361 struct block_device *bdev; 1362 1363 if ((unsigned int)type >= nr_swapfiles) 1364 return 0; 1365 if (!(swap_info[type]->flags & SWP_WRITEOK)) 1366 return 0; 1367 return map_swap_entry(swp_entry(type, offset), &bdev); 1368 } 1369 1370 /* 1371 * Return either the total number of swap pages of given type, or the number 1372 * of free pages of that type (depending on @free) 1373 * 1374 * This is needed for software suspend 1375 */ 1376 unsigned int count_swap_pages(int type, int free) 1377 { 1378 unsigned int n = 0; 1379 1380 spin_lock(&swap_lock); 1381 if ((unsigned int)type < nr_swapfiles) { 1382 struct swap_info_struct *sis = swap_info[type]; 1383 1384 spin_lock(&sis->lock); 1385 if (sis->flags & SWP_WRITEOK) { 1386 n = sis->pages; 1387 if (free) 1388 n -= sis->inuse_pages; 1389 } 1390 spin_unlock(&sis->lock); 1391 } 1392 spin_unlock(&swap_lock); 1393 return n; 1394 } 1395 #endif /* CONFIG_HIBERNATION */ 1396 1397 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 1398 { 1399 return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); 1400 } 1401 1402 /* 1403 * No need to decide whether this PTE shares the swap entry with others, 1404 * just let do_wp_page work it out if a write is requested later - to 1405 * force COW, vm_page_prot omits write permission from any private vma. 
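 * Returns 1 on success, 0 if the pte no longer matches the swap entry, or
 * -ENOMEM if the KSM copy or the memcg charge fails.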
1406 */ 1407 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 1408 unsigned long addr, swp_entry_t entry, struct page *page) 1409 { 1410 struct page *swapcache; 1411 struct mem_cgroup *memcg; 1412 spinlock_t *ptl; 1413 pte_t *pte; 1414 int ret = 1; 1415 1416 swapcache = page; 1417 page = ksm_might_need_to_copy(page, vma, addr); 1418 if (unlikely(!page)) 1419 return -ENOMEM; 1420 1421 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, 1422 &memcg, false)) { 1423 ret = -ENOMEM; 1424 goto out_nolock; 1425 } 1426 1427 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1428 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { 1429 mem_cgroup_cancel_charge(page, memcg, false); 1430 ret = 0; 1431 goto out; 1432 } 1433 1434 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 1435 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 1436 get_page(page); 1437 set_pte_at(vma->vm_mm, addr, pte, 1438 pte_mkold(mk_pte(page, vma->vm_page_prot))); 1439 if (page == swapcache) { 1440 page_add_anon_rmap(page, vma, addr, false); 1441 mem_cgroup_commit_charge(page, memcg, true, false); 1442 } else { /* ksm created a completely new copy */ 1443 page_add_new_anon_rmap(page, vma, addr, false); 1444 mem_cgroup_commit_charge(page, memcg, false, false); 1445 lru_cache_add_active_or_unevictable(page, vma); 1446 } 1447 swap_free(entry); 1448 /* 1449 * Move the page to the active list so it is not 1450 * immediately swapped out again after swapon. 1451 */ 1452 activate_page(page); 1453 out: 1454 pte_unmap_unlock(pte, ptl); 1455 out_nolock: 1456 if (page != swapcache) { 1457 unlock_page(page); 1458 put_page(page); 1459 } 1460 return ret; 1461 } 1462 1463 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 1464 unsigned long addr, unsigned long end, 1465 swp_entry_t entry, struct page *page) 1466 { 1467 pte_t swp_pte = swp_entry_to_pte(entry); 1468 pte_t *pte; 1469 int ret = 0; 1470 1471 /* 1472 * We don't actually need pte lock while scanning for swp_pte: since 1473 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 1474 * page table while we're scanning; though it could get zapped, and on 1475 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 1476 * of unmatched parts which look like swp_pte, so unuse_pte must 1477 * recheck under pte lock. Scanning without pte lock lets it be 1478 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 1479 */ 1480 pte = pte_offset_map(pmd, addr); 1481 do { 1482 /* 1483 * swapoff spends a _lot_ of time in this loop! 1484 * Test inline before going to call unuse_pte. 
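 * (unuse_pte() retakes the pte lock and rechecks pte_same_as_swp() before it
 * touches anything.)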
1485 */ 1486 if (unlikely(pte_same_as_swp(*pte, swp_pte))) { 1487 pte_unmap(pte); 1488 ret = unuse_pte(vma, pmd, addr, entry, page); 1489 if (ret) 1490 goto out; 1491 pte = pte_offset_map(pmd, addr); 1492 } 1493 } while (pte++, addr += PAGE_SIZE, addr != end); 1494 pte_unmap(pte - 1); 1495 out: 1496 return ret; 1497 } 1498 1499 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 1500 unsigned long addr, unsigned long end, 1501 swp_entry_t entry, struct page *page) 1502 { 1503 pmd_t *pmd; 1504 unsigned long next; 1505 int ret; 1506 1507 pmd = pmd_offset(pud, addr); 1508 do { 1509 cond_resched(); 1510 next = pmd_addr_end(addr, end); 1511 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1512 continue; 1513 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 1514 if (ret) 1515 return ret; 1516 } while (pmd++, addr = next, addr != end); 1517 return 0; 1518 } 1519 1520 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 1521 unsigned long addr, unsigned long end, 1522 swp_entry_t entry, struct page *page) 1523 { 1524 pud_t *pud; 1525 unsigned long next; 1526 int ret; 1527 1528 pud = pud_offset(pgd, addr); 1529 do { 1530 next = pud_addr_end(addr, end); 1531 if (pud_none_or_clear_bad(pud)) 1532 continue; 1533 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 1534 if (ret) 1535 return ret; 1536 } while (pud++, addr = next, addr != end); 1537 return 0; 1538 } 1539 1540 static int unuse_vma(struct vm_area_struct *vma, 1541 swp_entry_t entry, struct page *page) 1542 { 1543 pgd_t *pgd; 1544 unsigned long addr, end, next; 1545 int ret; 1546 1547 if (page_anon_vma(page)) { 1548 addr = page_address_in_vma(page, vma); 1549 if (addr == -EFAULT) 1550 return 0; 1551 else 1552 end = addr + PAGE_SIZE; 1553 } else { 1554 addr = vma->vm_start; 1555 end = vma->vm_end; 1556 } 1557 1558 pgd = pgd_offset(vma->vm_mm, addr); 1559 do { 1560 next = pgd_addr_end(addr, end); 1561 if (pgd_none_or_clear_bad(pgd)) 1562 continue; 1563 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 1564 if (ret) 1565 return ret; 1566 } while (pgd++, addr = next, addr != end); 1567 return 0; 1568 } 1569 1570 static int unuse_mm(struct mm_struct *mm, 1571 swp_entry_t entry, struct page *page) 1572 { 1573 struct vm_area_struct *vma; 1574 int ret = 0; 1575 1576 if (!down_read_trylock(&mm->mmap_sem)) { 1577 /* 1578 * Activate page so shrink_inactive_list is unlikely to unmap 1579 * its ptes while lock is dropped, so swapoff can make progress. 1580 */ 1581 activate_page(page); 1582 unlock_page(page); 1583 down_read(&mm->mmap_sem); 1584 lock_page(page); 1585 } 1586 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1587 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 1588 break; 1589 cond_resched(); 1590 } 1591 up_read(&mm->mmap_sem); 1592 return (ret < 0)? ret: 0; 1593 } 1594 1595 /* 1596 * Scan swap_map (or frontswap_map if frontswap parameter is true) 1597 * from current position to next entry still in use. 1598 * Recycle to start on reaching the end, returning 0 when empty. 1599 */ 1600 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 1601 unsigned int prev, bool frontswap) 1602 { 1603 unsigned int max = si->max; 1604 unsigned int i = prev; 1605 unsigned char count; 1606 1607 /* 1608 * No need for swap_lock here: we're just looking 1609 * for whether an entry is in use, not modifying it; false 1610 * hits are okay, and sys_swapoff() has already prevented new 1611 * allocations from this area (while holding swap_lock). 
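 * (READ_ONCE() below forces a fresh, single read of swap_map[i] on each
 * iteration despite the lack of locking.)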
1612 */ 1613 for (;;) { 1614 if (++i >= max) { 1615 if (!prev) { 1616 i = 0; 1617 break; 1618 } 1619 /* 1620 * No entries in use at top of swap_map, 1621 * loop back to start and recheck there. 1622 */ 1623 max = prev + 1; 1624 prev = 0; 1625 i = 1; 1626 } 1627 count = READ_ONCE(si->swap_map[i]); 1628 if (count && swap_count(count) != SWAP_MAP_BAD) 1629 if (!frontswap || frontswap_test(si, i)) 1630 break; 1631 if ((i % LATENCY_LIMIT) == 0) 1632 cond_resched(); 1633 } 1634 return i; 1635 } 1636 1637 /* 1638 * We completely avoid races by reading each swap page in advance, 1639 * and then search for the process using it. All the necessary 1640 * page table adjustments can then be made atomically. 1641 * 1642 * if the boolean frontswap is true, only unuse pages_to_unuse pages; 1643 * pages_to_unuse==0 means all pages; ignored if frontswap is false 1644 */ 1645 int try_to_unuse(unsigned int type, bool frontswap, 1646 unsigned long pages_to_unuse) 1647 { 1648 struct swap_info_struct *si = swap_info[type]; 1649 struct mm_struct *start_mm; 1650 volatile unsigned char *swap_map; /* swap_map is accessed without 1651 * locking. Mark it as volatile 1652 * to prevent compiler doing 1653 * something odd. 1654 */ 1655 unsigned char swcount; 1656 struct page *page; 1657 swp_entry_t entry; 1658 unsigned int i = 0; 1659 int retval = 0; 1660 1661 /* 1662 * When searching mms for an entry, a good strategy is to 1663 * start at the first mm we freed the previous entry from 1664 * (though actually we don't notice whether we or coincidence 1665 * freed the entry). Initialize this start_mm with a hold. 1666 * 1667 * A simpler strategy would be to start at the last mm we 1668 * freed the previous entry from; but that would take less 1669 * advantage of mmlist ordering, which clusters forked mms 1670 * together, child after parent. If we race with dup_mmap(), we 1671 * prefer to resolve parent before child, lest we miss entries 1672 * duplicated after we scanned child: using last mm would invert 1673 * that. 1674 */ 1675 start_mm = &init_mm; 1676 mmget(&init_mm); 1677 1678 /* 1679 * Keep on scanning until all entries have gone. Usually, 1680 * one pass through swap_map is enough, but not necessarily: 1681 * there are races when an instance of an entry might be missed. 1682 */ 1683 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { 1684 if (signal_pending(current)) { 1685 retval = -EINTR; 1686 break; 1687 } 1688 1689 /* 1690 * Get a page for the entry, using the existing swap 1691 * cache page if there is one. Otherwise, get a clean 1692 * page and read the swap into it. 1693 */ 1694 swap_map = &si->swap_map[i]; 1695 entry = swp_entry(type, i); 1696 page = read_swap_cache_async(entry, 1697 GFP_HIGHUSER_MOVABLE, NULL, 0); 1698 if (!page) { 1699 /* 1700 * Either swap_duplicate() failed because entry 1701 * has been freed independently, and will not be 1702 * reused since sys_swapoff() already disabled 1703 * allocation from here, or alloc_page() failed. 1704 */ 1705 swcount = *swap_map; 1706 /* 1707 * We don't hold lock here, so the swap entry could be 1708 * SWAP_MAP_BAD (when the cluster is discarding). 1709 * Instead of fail out, We can just skip the swap 1710 * entry because swapoff will wait for discarding 1711 * finish anyway. 1712 */ 1713 if (!swcount || swcount == SWAP_MAP_BAD) 1714 continue; 1715 retval = -ENOMEM; 1716 break; 1717 } 1718 1719 /* 1720 * Don't hold on to start_mm if it looks like exiting. 
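 * (mm_users == 1 means our own mmget() reference is the only thing keeping
 * it alive.)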
1721 */ 1722 if (atomic_read(&start_mm->mm_users) == 1) { 1723 mmput(start_mm); 1724 start_mm = &init_mm; 1725 mmget(&init_mm); 1726 } 1727 1728 /* 1729 * Wait for and lock page. When do_swap_page races with 1730 * try_to_unuse, do_swap_page can handle the fault much 1731 * faster than try_to_unuse can locate the entry. This 1732 * apparently redundant "wait_on_page_locked" lets try_to_unuse 1733 * defer to do_swap_page in such a case - in some tests, 1734 * do_swap_page and try_to_unuse repeatedly compete. 1735 */ 1736 wait_on_page_locked(page); 1737 wait_on_page_writeback(page); 1738 lock_page(page); 1739 wait_on_page_writeback(page); 1740 1741 /* 1742 * Remove all references to entry. 1743 */ 1744 swcount = *swap_map; 1745 if (swap_count(swcount) == SWAP_MAP_SHMEM) { 1746 retval = shmem_unuse(entry, page); 1747 /* page has already been unlocked and released */ 1748 if (retval < 0) 1749 break; 1750 continue; 1751 } 1752 if (swap_count(swcount) && start_mm != &init_mm) 1753 retval = unuse_mm(start_mm, entry, page); 1754 1755 if (swap_count(*swap_map)) { 1756 int set_start_mm = (*swap_map >= swcount); 1757 struct list_head *p = &start_mm->mmlist; 1758 struct mm_struct *new_start_mm = start_mm; 1759 struct mm_struct *prev_mm = start_mm; 1760 struct mm_struct *mm; 1761 1762 mmget(new_start_mm); 1763 mmget(prev_mm); 1764 spin_lock(&mmlist_lock); 1765 while (swap_count(*swap_map) && !retval && 1766 (p = p->next) != &start_mm->mmlist) { 1767 mm = list_entry(p, struct mm_struct, mmlist); 1768 if (!mmget_not_zero(mm)) 1769 continue; 1770 spin_unlock(&mmlist_lock); 1771 mmput(prev_mm); 1772 prev_mm = mm; 1773 1774 cond_resched(); 1775 1776 swcount = *swap_map; 1777 if (!swap_count(swcount)) /* any usage ? */ 1778 ; 1779 else if (mm == &init_mm) 1780 set_start_mm = 1; 1781 else 1782 retval = unuse_mm(mm, entry, page); 1783 1784 if (set_start_mm && *swap_map < swcount) { 1785 mmput(new_start_mm); 1786 mmget(mm); 1787 new_start_mm = mm; 1788 set_start_mm = 0; 1789 } 1790 spin_lock(&mmlist_lock); 1791 } 1792 spin_unlock(&mmlist_lock); 1793 mmput(prev_mm); 1794 mmput(start_mm); 1795 start_mm = new_start_mm; 1796 } 1797 if (retval) { 1798 unlock_page(page); 1799 put_page(page); 1800 break; 1801 } 1802 1803 /* 1804 * If a reference remains (rare), we would like to leave 1805 * the page in the swap cache; but try_to_unmap could 1806 * then re-duplicate the entry once we drop page lock, 1807 * so we might loop indefinitely; also, that page could 1808 * not be swapped out to other storage meanwhile. So: 1809 * delete from cache even if there's another reference, 1810 * after ensuring that the data has been saved to disk - 1811 * since if the reference remains (rarer), it will be 1812 * read from disk into another page. Splitting into two 1813 * pages would be incorrect if swap supported "shared 1814 * private" pages, but they are handled by tmpfs files. 1815 * 1816 * Given how unuse_vma() targets one particular offset 1817 * in an anon_vma, once the anon_vma has been determined, 1818 * this splitting happens to be just what is needed to 1819 * handle where KSM pages have been swapped out: re-reading 1820 * is unnecessarily slow, but we can fix that later on. 
1821 */ 1822 if (swap_count(*swap_map) && 1823 PageDirty(page) && PageSwapCache(page)) { 1824 struct writeback_control wbc = { 1825 .sync_mode = WB_SYNC_NONE, 1826 }; 1827 1828 swap_writepage(page, &wbc); 1829 lock_page(page); 1830 wait_on_page_writeback(page); 1831 } 1832 1833 /* 1834 * It is conceivable that a racing task removed this page from 1835 * swap cache just before we acquired the page lock at the top, 1836 * or while we dropped it in unuse_mm(). The page might even 1837 * be back in swap cache on another swap area: that we must not 1838 * delete, since it may not have been written out to swap yet. 1839 */ 1840 if (PageSwapCache(page) && 1841 likely(page_private(page) == entry.val)) 1842 delete_from_swap_cache(page); 1843 1844 /* 1845 * So we could skip searching mms once swap count went 1846 * to 1, we did not mark any present ptes as dirty: must 1847 * mark page dirty so shrink_page_list will preserve it. 1848 */ 1849 SetPageDirty(page); 1850 unlock_page(page); 1851 put_page(page); 1852 1853 /* 1854 * Make sure that we aren't completely killing 1855 * interactive performance. 1856 */ 1857 cond_resched(); 1858 if (frontswap && pages_to_unuse > 0) { 1859 if (!--pages_to_unuse) 1860 break; 1861 } 1862 } 1863 1864 mmput(start_mm); 1865 return retval; 1866 } 1867 1868 /* 1869 * After a successful try_to_unuse, if no swap is now in use, we know 1870 * we can empty the mmlist. swap_lock must be held on entry and exit. 1871 * Note that mmlist_lock nests inside swap_lock, and an mm must be 1872 * added to the mmlist just after page_duplicate - before would be racy. 1873 */ 1874 static void drain_mmlist(void) 1875 { 1876 struct list_head *p, *next; 1877 unsigned int type; 1878 1879 for (type = 0; type < nr_swapfiles; type++) 1880 if (swap_info[type]->inuse_pages) 1881 return; 1882 spin_lock(&mmlist_lock); 1883 list_for_each_safe(p, next, &init_mm.mmlist) 1884 list_del_init(p); 1885 spin_unlock(&mmlist_lock); 1886 } 1887 1888 /* 1889 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1890 * corresponds to page offset for the specified swap entry. 1891 * Note that the type of this function is sector_t, but it returns page offset 1892 * into the bdev, not sector offset. 1893 */ 1894 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) 1895 { 1896 struct swap_info_struct *sis; 1897 struct swap_extent *start_se; 1898 struct swap_extent *se; 1899 pgoff_t offset; 1900 1901 sis = swap_info[swp_type(entry)]; 1902 *bdev = sis->bdev; 1903 1904 offset = swp_offset(entry); 1905 start_se = sis->curr_swap_extent; 1906 se = start_se; 1907 1908 for ( ; ; ) { 1909 if (se->start_page <= offset && 1910 offset < (se->start_page + se->nr_pages)) { 1911 return se->start_block + (offset - se->start_page); 1912 } 1913 se = list_next_entry(se, list); 1914 sis->curr_swap_extent = se; 1915 BUG_ON(se == start_se); /* It *must* be present */ 1916 } 1917 } 1918 1919 /* 1920 * Returns the page offset into bdev for the specified page's swap entry. 
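 * (The page is expected to still be in the swap cache, so page_private()
 * holds its swp_entry_t.)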
1921 */ 1922 sector_t map_swap_page(struct page *page, struct block_device **bdev) 1923 { 1924 swp_entry_t entry; 1925 entry.val = page_private(page); 1926 return map_swap_entry(entry, bdev); 1927 } 1928 1929 /* 1930 * Free all of a swapdev's extent information 1931 */ 1932 static void destroy_swap_extents(struct swap_info_struct *sis) 1933 { 1934 while (!list_empty(&sis->first_swap_extent.list)) { 1935 struct swap_extent *se; 1936 1937 se = list_first_entry(&sis->first_swap_extent.list, 1938 struct swap_extent, list); 1939 list_del(&se->list); 1940 kfree(se); 1941 } 1942 1943 if (sis->flags & SWP_FILE) { 1944 struct file *swap_file = sis->swap_file; 1945 struct address_space *mapping = swap_file->f_mapping; 1946 1947 sis->flags &= ~SWP_FILE; 1948 mapping->a_ops->swap_deactivate(swap_file); 1949 } 1950 } 1951 1952 /* 1953 * Add a block range (and the corresponding page range) into this swapdev's 1954 * extent list. The extent list is kept sorted in page order. 1955 * 1956 * This function rather assumes that it is called in ascending page order. 1957 */ 1958 int 1959 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1960 unsigned long nr_pages, sector_t start_block) 1961 { 1962 struct swap_extent *se; 1963 struct swap_extent *new_se; 1964 struct list_head *lh; 1965 1966 if (start_page == 0) { 1967 se = &sis->first_swap_extent; 1968 sis->curr_swap_extent = se; 1969 se->start_page = 0; 1970 se->nr_pages = nr_pages; 1971 se->start_block = start_block; 1972 return 1; 1973 } else { 1974 lh = sis->first_swap_extent.list.prev; /* Highest extent */ 1975 se = list_entry(lh, struct swap_extent, list); 1976 BUG_ON(se->start_page + se->nr_pages != start_page); 1977 if (se->start_block + se->nr_pages == start_block) { 1978 /* Merge it */ 1979 se->nr_pages += nr_pages; 1980 return 0; 1981 } 1982 } 1983 1984 /* 1985 * No merge. Insert a new extent, preserving ordering. 1986 */ 1987 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 1988 if (new_se == NULL) 1989 return -ENOMEM; 1990 new_se->start_page = start_page; 1991 new_se->nr_pages = nr_pages; 1992 new_se->start_block = start_block; 1993 1994 list_add_tail(&new_se->list, &sis->first_swap_extent.list); 1995 return 1; 1996 } 1997 1998 /* 1999 * A `swap extent' is a simple thing which maps a contiguous range of pages 2000 * onto a contiguous range of disk blocks. An ordered list of swap extents 2001 * is built at swapon time and is then used at swap_writepage/swap_readpage 2002 * time for locating where on disk a page belongs. 2003 * 2004 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2005 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2006 * swap files identically. 2007 * 2008 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2009 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 2010 * swapfiles are handled *identically* after swapon time. 2011 * 2012 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2013 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 2014 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 2015 * requirements, they are simply tossed out - we will never use those blocks 2016 * for swapping. 2017 * 2018 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 2019 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 2020 * which will scribble on the fs. 
2021 * 2022 * The amount of disk space which a single swap extent represents varies. 2023 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2024 * extents in the list. To avoid much list walking, we cache the previous 2025 * search location in `curr_swap_extent', and start new searches from there. 2026 * This is extremely effective. The average number of iterations in 2027 * map_swap_page() has been measured at about 0.3 per page. - akpm. 2028 */ 2029 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2030 { 2031 struct file *swap_file = sis->swap_file; 2032 struct address_space *mapping = swap_file->f_mapping; 2033 struct inode *inode = mapping->host; 2034 int ret; 2035 2036 if (S_ISBLK(inode->i_mode)) { 2037 ret = add_swap_extent(sis, 0, sis->max, 0); 2038 *span = sis->pages; 2039 return ret; 2040 } 2041 2042 if (mapping->a_ops->swap_activate) { 2043 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2044 if (!ret) { 2045 sis->flags |= SWP_FILE; 2046 ret = add_swap_extent(sis, 0, sis->max, 0); 2047 *span = sis->pages; 2048 } 2049 return ret; 2050 } 2051 2052 return generic_swapfile_activate(sis, swap_file, span); 2053 } 2054 2055 static void _enable_swap_info(struct swap_info_struct *p, int prio, 2056 unsigned char *swap_map, 2057 struct swap_cluster_info *cluster_info) 2058 { 2059 if (prio >= 0) 2060 p->prio = prio; 2061 else 2062 p->prio = --least_priority; 2063 /* 2064 * the plist prio is negated because plist ordering is 2065 * low-to-high, while swap ordering is high-to-low 2066 */ 2067 p->list.prio = -p->prio; 2068 p->avail_list.prio = -p->prio; 2069 p->swap_map = swap_map; 2070 p->cluster_info = cluster_info; 2071 p->flags |= SWP_WRITEOK; 2072 atomic_long_add(p->pages, &nr_swap_pages); 2073 total_swap_pages += p->pages; 2074 2075 assert_spin_locked(&swap_lock); 2076 /* 2077 * both lists are plists, and thus priority ordered. 2078 * swap_active_head needs to be priority ordered for swapoff(), 2079 * which on removal of any swap_info_struct with an auto-assigned 2080 * (i.e. negative) priority increments the auto-assigned priority 2081 * of any lower-priority swap_info_structs. 2082 * swap_avail_head needs to be priority ordered for get_swap_page(), 2083 * which allocates swap pages from the highest available priority 2084 * swap_info_struct. 
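 * For example, areas given user priorities 10 and 5 get plist priorities
 * -10 and -5, so the priority-10 area sorts first; auto-assigned
 * priorities -1, -2, ... become plist priorities 1, 2, ... and therefore
 * sort after every area with an explicit non-negative priority.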
2085 */ 2086 plist_add(&p->list, &swap_active_head); 2087 spin_lock(&swap_avail_lock); 2088 plist_add(&p->avail_list, &swap_avail_head); 2089 spin_unlock(&swap_avail_lock); 2090 } 2091 2092 static void enable_swap_info(struct swap_info_struct *p, int prio, 2093 unsigned char *swap_map, 2094 struct swap_cluster_info *cluster_info, 2095 unsigned long *frontswap_map) 2096 { 2097 frontswap_init(p->type, frontswap_map); 2098 spin_lock(&swap_lock); 2099 spin_lock(&p->lock); 2100 _enable_swap_info(p, prio, swap_map, cluster_info); 2101 spin_unlock(&p->lock); 2102 spin_unlock(&swap_lock); 2103 } 2104 2105 static void reinsert_swap_info(struct swap_info_struct *p) 2106 { 2107 spin_lock(&swap_lock); 2108 spin_lock(&p->lock); 2109 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); 2110 spin_unlock(&p->lock); 2111 spin_unlock(&swap_lock); 2112 } 2113 2114 bool has_usable_swap(void) 2115 { 2116 bool ret = true; 2117 2118 spin_lock(&swap_lock); 2119 if (plist_head_empty(&swap_active_head)) 2120 ret = false; 2121 spin_unlock(&swap_lock); 2122 return ret; 2123 } 2124 2125 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2126 { 2127 struct swap_info_struct *p = NULL; 2128 unsigned char *swap_map; 2129 struct swap_cluster_info *cluster_info; 2130 unsigned long *frontswap_map; 2131 struct file *swap_file, *victim; 2132 struct address_space *mapping; 2133 struct inode *inode; 2134 struct filename *pathname; 2135 int err, found = 0; 2136 unsigned int old_block_size; 2137 2138 if (!capable(CAP_SYS_ADMIN)) 2139 return -EPERM; 2140 2141 BUG_ON(!current->mm); 2142 2143 pathname = getname(specialfile); 2144 if (IS_ERR(pathname)) 2145 return PTR_ERR(pathname); 2146 2147 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2148 err = PTR_ERR(victim); 2149 if (IS_ERR(victim)) 2150 goto out; 2151 2152 mapping = victim->f_mapping; 2153 spin_lock(&swap_lock); 2154 plist_for_each_entry(p, &swap_active_head, list) { 2155 if (p->flags & SWP_WRITEOK) { 2156 if (p->swap_file->f_mapping == mapping) { 2157 found = 1; 2158 break; 2159 } 2160 } 2161 } 2162 if (!found) { 2163 err = -EINVAL; 2164 spin_unlock(&swap_lock); 2165 goto out_dput; 2166 } 2167 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2168 vm_unacct_memory(p->pages); 2169 else { 2170 err = -ENOMEM; 2171 spin_unlock(&swap_lock); 2172 goto out_dput; 2173 } 2174 spin_lock(&swap_avail_lock); 2175 plist_del(&p->avail_list, &swap_avail_head); 2176 spin_unlock(&swap_avail_lock); 2177 spin_lock(&p->lock); 2178 if (p->prio < 0) { 2179 struct swap_info_struct *si = p; 2180 2181 plist_for_each_entry_continue(si, &swap_active_head, list) { 2182 si->prio++; 2183 si->list.prio--; 2184 si->avail_list.prio--; 2185 } 2186 least_priority++; 2187 } 2188 plist_del(&p->list, &swap_active_head); 2189 atomic_long_sub(p->pages, &nr_swap_pages); 2190 total_swap_pages -= p->pages; 2191 p->flags &= ~SWP_WRITEOK; 2192 spin_unlock(&p->lock); 2193 spin_unlock(&swap_lock); 2194 2195 disable_swap_slots_cache_lock(); 2196 2197 set_current_oom_origin(); 2198 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ 2199 clear_current_oom_origin(); 2200 2201 if (err) { 2202 /* re-insert swap space back into swap_list */ 2203 reinsert_swap_info(p); 2204 reenable_swap_slots_cache_unlock(); 2205 goto out_dput; 2206 } 2207 2208 reenable_swap_slots_cache_unlock(); 2209 2210 flush_work(&p->discard_work); 2211 2212 destroy_swap_extents(p); 2213 if (p->flags & SWP_CONTINUED) 2214 free_swap_count_continuations(p); 2215 2216 mutex_lock(&swapon_mutex); 2217 
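	/*
	 * Taking swapon_mutex here keeps the /proc/swaps iterator
	 * (swap_start/swap_next/swap_show) from reading p->swap_file and
	 * p->swap_map while they are torn down below; the lock order matches
	 * swapon: swapon_mutex -> swap_lock -> p->lock.
	 */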
spin_lock(&swap_lock); 2218 spin_lock(&p->lock); 2219 drain_mmlist(); 2220 2221 /* wait for anyone still in scan_swap_map */ 2222 p->highest_bit = 0; /* cuts scans short */ 2223 while (p->flags >= SWP_SCANNING) { 2224 spin_unlock(&p->lock); 2225 spin_unlock(&swap_lock); 2226 schedule_timeout_uninterruptible(1); 2227 spin_lock(&swap_lock); 2228 spin_lock(&p->lock); 2229 } 2230 2231 swap_file = p->swap_file; 2232 old_block_size = p->old_block_size; 2233 p->swap_file = NULL; 2234 p->max = 0; 2235 swap_map = p->swap_map; 2236 p->swap_map = NULL; 2237 cluster_info = p->cluster_info; 2238 p->cluster_info = NULL; 2239 frontswap_map = frontswap_map_get(p); 2240 spin_unlock(&p->lock); 2241 spin_unlock(&swap_lock); 2242 frontswap_invalidate_area(p->type); 2243 frontswap_map_set(p, NULL); 2244 mutex_unlock(&swapon_mutex); 2245 free_percpu(p->percpu_cluster); 2246 p->percpu_cluster = NULL; 2247 vfree(swap_map); 2248 vfree(cluster_info); 2249 vfree(frontswap_map); 2250 /* Destroy swap account information */ 2251 swap_cgroup_swapoff(p->type); 2252 exit_swap_address_space(p->type); 2253 2254 inode = mapping->host; 2255 if (S_ISBLK(inode->i_mode)) { 2256 struct block_device *bdev = I_BDEV(inode); 2257 set_blocksize(bdev, old_block_size); 2258 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2259 } else { 2260 inode_lock(inode); 2261 inode->i_flags &= ~S_SWAPFILE; 2262 inode_unlock(inode); 2263 } 2264 filp_close(swap_file, NULL); 2265 2266 /* 2267 * Clear the SWP_USED flag after all resources are freed so that swapon 2268 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2269 * not hold p->lock after we cleared its SWP_WRITEOK. 2270 */ 2271 spin_lock(&swap_lock); 2272 p->flags = 0; 2273 spin_unlock(&swap_lock); 2274 2275 err = 0; 2276 atomic_inc(&proc_poll_event); 2277 wake_up_interruptible(&proc_poll_wait); 2278 2279 out_dput: 2280 filp_close(victim, NULL); 2281 out: 2282 putname(pathname); 2283 return err; 2284 } 2285 2286 #ifdef CONFIG_PROC_FS 2287 static unsigned swaps_poll(struct file *file, poll_table *wait) 2288 { 2289 struct seq_file *seq = file->private_data; 2290 2291 poll_wait(file, &proc_poll_wait, wait); 2292 2293 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2294 seq->poll_event = atomic_read(&proc_poll_event); 2295 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 2296 } 2297 2298 return POLLIN | POLLRDNORM; 2299 } 2300 2301 /* iterator */ 2302 static void *swap_start(struct seq_file *swap, loff_t *pos) 2303 { 2304 struct swap_info_struct *si; 2305 int type; 2306 loff_t l = *pos; 2307 2308 mutex_lock(&swapon_mutex); 2309 2310 if (!l) 2311 return SEQ_START_TOKEN; 2312 2313 for (type = 0; type < nr_swapfiles; type++) { 2314 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2315 si = swap_info[type]; 2316 if (!(si->flags & SWP_USED) || !si->swap_map) 2317 continue; 2318 if (!--l) 2319 return si; 2320 } 2321 2322 return NULL; 2323 } 2324 2325 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2326 { 2327 struct swap_info_struct *si = v; 2328 int type; 2329 2330 if (v == SEQ_START_TOKEN) 2331 type = 0; 2332 else 2333 type = si->type + 1; 2334 2335 for (; type < nr_swapfiles; type++) { 2336 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2337 si = swap_info[type]; 2338 if (!(si->flags & SWP_USED) || !si->swap_map) 2339 continue; 2340 ++*pos; 2341 return si; 2342 } 2343 2344 return NULL; 2345 } 2346 2347 static void swap_stop(struct seq_file *swap, void *v) 2348 { 2349 mutex_unlock(&swapon_mutex); 2350 } 2351 2352 static int 
swap_show(struct seq_file *swap, void *v) 2353 { 2354 struct swap_info_struct *si = v; 2355 struct file *file; 2356 int len; 2357 2358 if (si == SEQ_START_TOKEN) { 2359 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 2360 return 0; 2361 } 2362 2363 file = si->swap_file; 2364 len = seq_file_path(swap, file, " \t\n\\"); 2365 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 2366 len < 40 ? 40 - len : 1, " ", 2367 S_ISBLK(file_inode(file)->i_mode) ? 2368 "partition" : "file\t", 2369 si->pages << (PAGE_SHIFT - 10), 2370 si->inuse_pages << (PAGE_SHIFT - 10), 2371 si->prio); 2372 return 0; 2373 } 2374 2375 static const struct seq_operations swaps_op = { 2376 .start = swap_start, 2377 .next = swap_next, 2378 .stop = swap_stop, 2379 .show = swap_show 2380 }; 2381 2382 static int swaps_open(struct inode *inode, struct file *file) 2383 { 2384 struct seq_file *seq; 2385 int ret; 2386 2387 ret = seq_open(file, &swaps_op); 2388 if (ret) 2389 return ret; 2390 2391 seq = file->private_data; 2392 seq->poll_event = atomic_read(&proc_poll_event); 2393 return 0; 2394 } 2395 2396 static const struct file_operations proc_swaps_operations = { 2397 .open = swaps_open, 2398 .read = seq_read, 2399 .llseek = seq_lseek, 2400 .release = seq_release, 2401 .poll = swaps_poll, 2402 }; 2403 2404 static int __init procswaps_init(void) 2405 { 2406 proc_create("swaps", 0, NULL, &proc_swaps_operations); 2407 return 0; 2408 } 2409 __initcall(procswaps_init); 2410 #endif /* CONFIG_PROC_FS */ 2411 2412 #ifdef MAX_SWAPFILES_CHECK 2413 static int __init max_swapfiles_check(void) 2414 { 2415 MAX_SWAPFILES_CHECK(); 2416 return 0; 2417 } 2418 late_initcall(max_swapfiles_check); 2419 #endif 2420 2421 static struct swap_info_struct *alloc_swap_info(void) 2422 { 2423 struct swap_info_struct *p; 2424 unsigned int type; 2425 2426 p = kzalloc(sizeof(*p), GFP_KERNEL); 2427 if (!p) 2428 return ERR_PTR(-ENOMEM); 2429 2430 spin_lock(&swap_lock); 2431 for (type = 0; type < nr_swapfiles; type++) { 2432 if (!(swap_info[type]->flags & SWP_USED)) 2433 break; 2434 } 2435 if (type >= MAX_SWAPFILES) { 2436 spin_unlock(&swap_lock); 2437 kfree(p); 2438 return ERR_PTR(-EPERM); 2439 } 2440 if (type >= nr_swapfiles) { 2441 p->type = type; 2442 swap_info[type] = p; 2443 /* 2444 * Write swap_info[type] before nr_swapfiles, in case a 2445 * racing procfs swap_start() or swap_next() is reading them. 2446 * (We never shrink nr_swapfiles, we never free this entry.) 2447 */ 2448 smp_wmb(); 2449 nr_swapfiles++; 2450 } else { 2451 kfree(p); 2452 p = swap_info[type]; 2453 /* 2454 * Do not memset this entry: a racing procfs swap_next() 2455 * would be relying on p->type to remain valid. 
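 * Only the fields below are (re)initialized. The smp_wmb() in the branch
 * above pairs with the smp_rmb() in swap_start() and swap_next(), so a
 * reader that observes the new nr_swapfiles also observes the matching
 * swap_info[type] pointer.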
2456 */ 2457 } 2458 INIT_LIST_HEAD(&p->first_swap_extent.list); 2459 plist_node_init(&p->list, 0); 2460 plist_node_init(&p->avail_list, 0); 2461 p->flags = SWP_USED; 2462 spin_unlock(&swap_lock); 2463 spin_lock_init(&p->lock); 2464 2465 return p; 2466 } 2467 2468 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 2469 { 2470 int error; 2471 2472 if (S_ISBLK(inode->i_mode)) { 2473 p->bdev = bdgrab(I_BDEV(inode)); 2474 error = blkdev_get(p->bdev, 2475 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); 2476 if (error < 0) { 2477 p->bdev = NULL; 2478 return error; 2479 } 2480 p->old_block_size = block_size(p->bdev); 2481 error = set_blocksize(p->bdev, PAGE_SIZE); 2482 if (error < 0) 2483 return error; 2484 p->flags |= SWP_BLKDEV; 2485 } else if (S_ISREG(inode->i_mode)) { 2486 p->bdev = inode->i_sb->s_bdev; 2487 inode_lock(inode); 2488 if (IS_SWAPFILE(inode)) 2489 return -EBUSY; 2490 } else 2491 return -EINVAL; 2492 2493 return 0; 2494 } 2495 2496 static unsigned long read_swap_header(struct swap_info_struct *p, 2497 union swap_header *swap_header, 2498 struct inode *inode) 2499 { 2500 int i; 2501 unsigned long maxpages; 2502 unsigned long swapfilepages; 2503 unsigned long last_page; 2504 2505 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 2506 pr_err("Unable to find swap-space signature\n"); 2507 return 0; 2508 } 2509 2510 /* swap partition endianess hack... */ 2511 if (swab32(swap_header->info.version) == 1) { 2512 swab32s(&swap_header->info.version); 2513 swab32s(&swap_header->info.last_page); 2514 swab32s(&swap_header->info.nr_badpages); 2515 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 2516 return 0; 2517 for (i = 0; i < swap_header->info.nr_badpages; i++) 2518 swab32s(&swap_header->info.badpages[i]); 2519 } 2520 /* Check the swap header's sub-version */ 2521 if (swap_header->info.version != 1) { 2522 pr_warn("Unable to handle swap header version %d\n", 2523 swap_header->info.version); 2524 return 0; 2525 } 2526 2527 p->lowest_bit = 1; 2528 p->cluster_next = 1; 2529 p->cluster_nr = 0; 2530 2531 /* 2532 * Find out how many pages are allowed for a single swap 2533 * device. There are two limiting factors: 1) the number 2534 * of bits for the swap offset in the swp_entry_t type, and 2535 * 2) the number of bits in the swap pte as defined by the 2536 * different architectures. In order to find the 2537 * largest possible bit mask, a swap entry with swap type 0 2538 * and swap offset ~0UL is created, encoded to a swap pte, 2539 * decoded to a swp_entry_t again, and finally the swap 2540 * offset is extracted. This will mask all the bits from 2541 * the initial ~0UL mask that can't be encoded in either 2542 * the swp_entry_t or the architecture definition of a 2543 * swap pte. 
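 *
 * Purely for illustration: if an architecture's swap pte left 32 bits
 * for the offset, the round trip below would turn the initial ~0UL
 * offset into 0xffffffff, and maxpages would come out as 2^32 pages
 * (16TB of swap with 4K pages).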
2544 */ 2545 maxpages = swp_offset(pte_to_swp_entry( 2546 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 2547 last_page = swap_header->info.last_page; 2548 if (last_page > maxpages) { 2549 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 2550 maxpages << (PAGE_SHIFT - 10), 2551 last_page << (PAGE_SHIFT - 10)); 2552 } 2553 if (maxpages > last_page) { 2554 maxpages = last_page + 1; 2555 /* p->max is an unsigned int: don't overflow it */ 2556 if ((unsigned int)maxpages == 0) 2557 maxpages = UINT_MAX; 2558 } 2559 p->highest_bit = maxpages - 1; 2560 2561 if (!maxpages) 2562 return 0; 2563 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 2564 if (swapfilepages && maxpages > swapfilepages) { 2565 pr_warn("Swap area shorter than signature indicates\n"); 2566 return 0; 2567 } 2568 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 2569 return 0; 2570 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 2571 return 0; 2572 2573 return maxpages; 2574 } 2575 2576 #define SWAP_CLUSTER_INFO_COLS \ 2577 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 2578 #define SWAP_CLUSTER_SPACE_COLS \ 2579 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 2580 #define SWAP_CLUSTER_COLS \ 2581 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 2582 2583 static int setup_swap_map_and_extents(struct swap_info_struct *p, 2584 union swap_header *swap_header, 2585 unsigned char *swap_map, 2586 struct swap_cluster_info *cluster_info, 2587 unsigned long maxpages, 2588 sector_t *span) 2589 { 2590 unsigned int j, k; 2591 unsigned int nr_good_pages; 2592 int nr_extents; 2593 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2594 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; 2595 unsigned long i, idx; 2596 2597 nr_good_pages = maxpages - 1; /* omit header page */ 2598 2599 cluster_list_init(&p->free_clusters); 2600 cluster_list_init(&p->discard_clusters); 2601 2602 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2603 unsigned int page_nr = swap_header->info.badpages[i]; 2604 if (page_nr == 0 || page_nr > swap_header->info.last_page) 2605 return -EINVAL; 2606 if (page_nr < maxpages) { 2607 swap_map[page_nr] = SWAP_MAP_BAD; 2608 nr_good_pages--; 2609 /* 2610 * Haven't marked the cluster free yet, no list 2611 * operation involved 2612 */ 2613 inc_cluster_info_page(p, cluster_info, page_nr); 2614 } 2615 } 2616 2617 /* Haven't marked the cluster free yet, no list operation involved */ 2618 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) 2619 inc_cluster_info_page(p, cluster_info, i); 2620 2621 if (nr_good_pages) { 2622 swap_map[0] = SWAP_MAP_BAD; 2623 /* 2624 * Haven't marked the cluster free yet, no list 2625 * operation involved 2626 */ 2627 inc_cluster_info_page(p, cluster_info, 0); 2628 p->max = maxpages; 2629 p->pages = nr_good_pages; 2630 nr_extents = setup_swap_extents(p, span); 2631 if (nr_extents < 0) 2632 return nr_extents; 2633 nr_good_pages = p->pages; 2634 } 2635 if (!nr_good_pages) { 2636 pr_warn("Empty swap-file\n"); 2637 return -EINVAL; 2638 } 2639 2640 if (!cluster_info) 2641 return nr_extents; 2642 2643 2644 /* 2645 * Reduce false cache line sharing between cluster_info entries, and 2646 * avoid letting consecutively allocated clusters share the same swap address space.
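 *
 * With col == 0 and, say, SWAP_CLUSTER_COLS == 4 (an illustrative value),
 * the loop below links free clusters in the order 0, 4, 8, ..., then
 * 1, 5, 9, ..., so clusters that are adjacent on the free list sit far
 * apart in the cluster_info array.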
2647 */ 2648 for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 2649 j = (k + col) % SWAP_CLUSTER_COLS; 2650 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 2651 idx = i * SWAP_CLUSTER_COLS + j; 2652 if (idx >= nr_clusters) 2653 continue; 2654 if (cluster_count(&cluster_info[idx])) 2655 continue; 2656 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 2657 cluster_list_add_tail(&p->free_clusters, cluster_info, 2658 idx); 2659 } 2660 } 2661 return nr_extents; 2662 } 2663 2664 /* 2665 * Helper for sys_swapon() to determine whether a given swap 2666 * backing device queue supports DISCARD operations. 2667 */ 2668 static bool swap_discardable(struct swap_info_struct *si) 2669 { 2670 struct request_queue *q = bdev_get_queue(si->bdev); 2671 2672 if (!q || !blk_queue_discard(q)) 2673 return false; 2674 2675 return true; 2676 } 2677 2678 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2679 { 2680 struct swap_info_struct *p; 2681 struct filename *name; 2682 struct file *swap_file = NULL; 2683 struct address_space *mapping; 2684 int prio; 2685 int error; 2686 union swap_header *swap_header; 2687 int nr_extents; 2688 sector_t span; 2689 unsigned long maxpages; 2690 unsigned char *swap_map = NULL; 2691 struct swap_cluster_info *cluster_info = NULL; 2692 unsigned long *frontswap_map = NULL; 2693 struct page *page = NULL; 2694 struct inode *inode = NULL; 2695 2696 if (swap_flags & ~SWAP_FLAGS_VALID) 2697 return -EINVAL; 2698 2699 if (!capable(CAP_SYS_ADMIN)) 2700 return -EPERM; 2701 2702 p = alloc_swap_info(); 2703 if (IS_ERR(p)) 2704 return PTR_ERR(p); 2705 2706 INIT_WORK(&p->discard_work, swap_discard_work); 2707 2708 name = getname(specialfile); 2709 if (IS_ERR(name)) { 2710 error = PTR_ERR(name); 2711 name = NULL; 2712 goto bad_swap; 2713 } 2714 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 2715 if (IS_ERR(swap_file)) { 2716 error = PTR_ERR(swap_file); 2717 swap_file = NULL; 2718 goto bad_swap; 2719 } 2720 2721 p->swap_file = swap_file; 2722 mapping = swap_file->f_mapping; 2723 inode = mapping->host; 2724 2725 /* If S_ISREG(inode->i_mode), claim_swapfile() will do inode_lock(inode) */ 2726 error = claim_swapfile(p, inode); 2727 if (unlikely(error)) 2728 goto bad_swap; 2729 2730 /* 2731 * Read the swap header.
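 * The header occupies page 0 of the swap area: read_swap_header() below
 * checks its "SWAPSPACE2" signature and version, and page 0 is later
 * marked SWAP_MAP_BAD in setup_swap_map_and_extents() so it is never
 * handed out as swap space.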
2732 */ 2733 if (!mapping->a_ops->readpage) { 2734 error = -EINVAL; 2735 goto bad_swap; 2736 } 2737 page = read_mapping_page(mapping, 0, swap_file); 2738 if (IS_ERR(page)) { 2739 error = PTR_ERR(page); 2740 goto bad_swap; 2741 } 2742 swap_header = kmap(page); 2743 2744 maxpages = read_swap_header(p, swap_header, inode); 2745 if (unlikely(!maxpages)) { 2746 error = -EINVAL; 2747 goto bad_swap; 2748 } 2749 2750 /* OK, set up the swap map and apply the bad block list */ 2751 swap_map = vzalloc(maxpages); 2752 if (!swap_map) { 2753 error = -ENOMEM; 2754 goto bad_swap; 2755 } 2756 2757 if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) 2758 p->flags |= SWP_STABLE_WRITES; 2759 2760 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2761 int cpu; 2762 unsigned long ci, nr_cluster; 2763 2764 p->flags |= SWP_SOLIDSTATE; 2765 /* 2766 * select a random position to start with to help wear leveling 2767 * SSD 2768 */ 2769 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2770 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2771 2772 cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); 2773 if (!cluster_info) { 2774 error = -ENOMEM; 2775 goto bad_swap; 2776 } 2777 2778 for (ci = 0; ci < nr_cluster; ci++) 2779 spin_lock_init(&((cluster_info + ci)->lock)); 2780 2781 p->percpu_cluster = alloc_percpu(struct percpu_cluster); 2782 if (!p->percpu_cluster) { 2783 error = -ENOMEM; 2784 goto bad_swap; 2785 } 2786 for_each_possible_cpu(cpu) { 2787 struct percpu_cluster *cluster; 2788 cluster = per_cpu_ptr(p->percpu_cluster, cpu); 2789 cluster_set_null(&cluster->index); 2790 } 2791 } 2792 2793 error = swap_cgroup_swapon(p->type, maxpages); 2794 if (error) 2795 goto bad_swap; 2796 2797 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 2798 cluster_info, maxpages, &span); 2799 if (unlikely(nr_extents < 0)) { 2800 error = nr_extents; 2801 goto bad_swap; 2802 } 2803 /* frontswap enabled? set up bit-per-page map for frontswap */ 2804 if (IS_ENABLED(CONFIG_FRONTSWAP)) 2805 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 2806 2807 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2808 /* 2809 * When discard is enabled for swap with no particular 2810 * policy flagged, we set all swap discard flags here in 2811 * order to sustain backward compatibility with older 2812 * swapon(8) releases. 2813 */ 2814 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 2815 SWP_PAGE_DISCARD); 2816 2817 /* 2818 * By flagging sys_swapon, a sysadmin can tell us to 2819 * either do single-time area discards only, or to just 2820 * perform discards for released swap page-clusters. 2821 * Now it's time to adjust the p->flags accordingly. 2822 */ 2823 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 2824 p->flags &= ~SWP_PAGE_DISCARD; 2825 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 2826 p->flags &= ~SWP_AREA_DISCARD; 2827 2828 /* issue a swapon-time discard if it's still required */ 2829 if (p->flags & SWP_AREA_DISCARD) { 2830 int err = discard_swap(p); 2831 if (unlikely(err)) 2832 pr_err("swapon: discard_swap(%p): %d\n", 2833 p, err); 2834 } 2835 } 2836 2837 error = init_swap_address_space(p->type, maxpages); 2838 if (error) 2839 goto bad_swap; 2840 2841 mutex_lock(&swapon_mutex); 2842 prio = -1; 2843 if (swap_flags & SWAP_FLAG_PREFER) 2844 prio = 2845 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2846 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 2847 2848 pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 2849 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2850 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2851 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2852 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2853 (p->flags & SWP_AREA_DISCARD) ? "s" : "", 2854 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", 2855 (frontswap_map) ? "FS" : ""); 2856 2857 mutex_unlock(&swapon_mutex); 2858 atomic_inc(&proc_poll_event); 2859 wake_up_interruptible(&proc_poll_wait); 2860 2861 if (S_ISREG(inode->i_mode)) 2862 inode->i_flags |= S_SWAPFILE; 2863 error = 0; 2864 goto out; 2865 bad_swap: 2866 free_percpu(p->percpu_cluster); 2867 p->percpu_cluster = NULL; 2868 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2869 set_blocksize(p->bdev, p->old_block_size); 2870 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2871 } 2872 destroy_swap_extents(p); 2873 swap_cgroup_swapoff(p->type); 2874 spin_lock(&swap_lock); 2875 p->swap_file = NULL; 2876 p->flags = 0; 2877 spin_unlock(&swap_lock); 2878 vfree(swap_map); 2879 vfree(cluster_info); 2880 if (swap_file) { 2881 if (inode && S_ISREG(inode->i_mode)) { 2882 inode_unlock(inode); 2883 inode = NULL; 2884 } 2885 filp_close(swap_file, NULL); 2886 } 2887 out: 2888 if (page && !IS_ERR(page)) { 2889 kunmap(page); 2890 put_page(page); 2891 } 2892 if (name) 2893 putname(name); 2894 if (inode && S_ISREG(inode->i_mode)) 2895 inode_unlock(inode); 2896 if (!error) 2897 enable_swap_slots_cache(); 2898 return error; 2899 } 2900 2901 void si_swapinfo(struct sysinfo *val) 2902 { 2903 unsigned int type; 2904 unsigned long nr_to_be_unused = 0; 2905 2906 spin_lock(&swap_lock); 2907 for (type = 0; type < nr_swapfiles; type++) { 2908 struct swap_info_struct *si = swap_info[type]; 2909 2910 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2911 nr_to_be_unused += si->inuse_pages; 2912 } 2913 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 2914 val->totalswap = total_swap_pages + nr_to_be_unused; 2915 spin_unlock(&swap_lock); 2916 } 2917 2918 /* 2919 * Verify that a swap entry is valid and increment its swap map count. 2920 * 2921 * Returns error code in following case. 2922 * - success -> 0 2923 * - swp_entry is invalid -> EINVAL 2924 * - swp_entry is migration entry -> EINVAL 2925 * - swap-cache reference is requested but there is already one. -> EEXIST 2926 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2927 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 2928 */ 2929 static int __swap_duplicate(swp_entry_t entry, unsigned char usage) 2930 { 2931 struct swap_info_struct *p; 2932 struct swap_cluster_info *ci; 2933 unsigned long offset, type; 2934 unsigned char count; 2935 unsigned char has_cache; 2936 int err = -EINVAL; 2937 2938 if (non_swap_entry(entry)) 2939 goto out; 2940 2941 type = swp_type(entry); 2942 if (type >= nr_swapfiles) 2943 goto bad_file; 2944 p = swap_info[type]; 2945 offset = swp_offset(entry); 2946 if (unlikely(offset >= p->max)) 2947 goto out; 2948 2949 ci = lock_cluster_or_swap_info(p, offset); 2950 2951 count = p->swap_map[offset]; 2952 2953 /* 2954 * swapin_readahead() doesn't check if a swap entry is valid, so the 2955 * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
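 *
 * The usage argument follows the callers below: 1 for an extra map
 * reference from swap_duplicate(), SWAP_HAS_CACHE from
 * swapcache_prepare(), and SWAP_MAP_SHMEM from swap_shmem_alloc().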
2956 */ 2957 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 2958 err = -ENOENT; 2959 goto unlock_out; 2960 } 2961 2962 has_cache = count & SWAP_HAS_CACHE; 2963 count &= ~SWAP_HAS_CACHE; 2964 err = 0; 2965 2966 if (usage == SWAP_HAS_CACHE) { 2967 2968 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2969 if (!has_cache && count) 2970 has_cache = SWAP_HAS_CACHE; 2971 else if (has_cache) /* someone else added cache */ 2972 err = -EEXIST; 2973 else /* no users remaining */ 2974 err = -ENOENT; 2975 2976 } else if (count || has_cache) { 2977 2978 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 2979 count += usage; 2980 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) 2981 err = -EINVAL; 2982 else if (swap_count_continued(p, offset, count)) 2983 count = COUNT_CONTINUED; 2984 else 2985 err = -ENOMEM; 2986 } else 2987 err = -ENOENT; /* unused swap entry */ 2988 2989 p->swap_map[offset] = count | has_cache; 2990 2991 unlock_out: 2992 unlock_cluster_or_swap_info(p, ci); 2993 out: 2994 return err; 2995 2996 bad_file: 2997 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); 2998 goto out; 2999 } 3000 3001 /* 3002 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3003 * (in which case its reference count is never incremented). 3004 */ 3005 void swap_shmem_alloc(swp_entry_t entry) 3006 { 3007 __swap_duplicate(entry, SWAP_MAP_SHMEM); 3008 } 3009 3010 /* 3011 * Increase reference count of swap entry by 1. 3012 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3013 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3014 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3015 * might occur if a page table entry has got corrupted. 3016 */ 3017 int swap_duplicate(swp_entry_t entry) 3018 { 3019 int err = 0; 3020 3021 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 3022 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3023 return err; 3024 } 3025 3026 /* 3027 * @entry: swap entry for which we allocate swap cache. 3028 * 3029 * Called when allocating swap cache for existing swap entry, 3030 * This can return error codes. Returns 0 at success. 3031 * -EBUSY means there is a swap cache. 3032 * Note: return code is different from swap_duplicate(). 3033 */ 3034 int swapcache_prepare(swp_entry_t entry) 3035 { 3036 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3037 } 3038 3039 struct swap_info_struct *page_swap_info(struct page *page) 3040 { 3041 swp_entry_t swap = { .val = page_private(page) }; 3042 return swap_info[swp_type(swap)]; 3043 } 3044 3045 /* 3046 * out-of-line __page_file_ methods to avoid include hell. 3047 */ 3048 struct address_space *__page_file_mapping(struct page *page) 3049 { 3050 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 3051 return page_swap_info(page)->swap_file->f_mapping; 3052 } 3053 EXPORT_SYMBOL_GPL(__page_file_mapping); 3054 3055 pgoff_t __page_file_index(struct page *page) 3056 { 3057 swp_entry_t swap = { .val = page_private(page) }; 3058 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 3059 return swp_offset(swap); 3060 } 3061 EXPORT_SYMBOL_GPL(__page_file_index); 3062 3063 /* 3064 * add_swap_count_continuation - called when a swap count is duplicated 3065 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3066 * page of the original vmalloc'ed swap_map, to hold the continuation count 3067 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3068 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 
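 *
 * Roughly speaking the count behaves like a multi-digit number: the low
 * digit lives in swap_map itself and each continuation page supplies one
 * further digit (one byte per entry of that swap_map page). With the
 * usual swap.h values (SWAP_MAP_MAX 0x3e, SWAP_CONT_MAX 0x7f), the first
 * continuation page is needed once a single swap page is referenced more
 * than 62 times.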
3069 * 3070 * These continuation pages are seldom referenced: the common paths all work 3071 * on the original swap_map, only referring to a continuation page when the 3072 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3073 * 3074 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3075 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3076 * can be called after dropping locks. 3077 */ 3078 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3079 { 3080 struct swap_info_struct *si; 3081 struct swap_cluster_info *ci; 3082 struct page *head; 3083 struct page *page; 3084 struct page *list_page; 3085 pgoff_t offset; 3086 unsigned char count; 3087 3088 /* 3089 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3090 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3091 */ 3092 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3093 3094 si = swap_info_get(entry); 3095 if (!si) { 3096 /* 3097 * An acceptable race has occurred since the failing 3098 * __swap_duplicate(): the swap entry has been freed, 3099 * perhaps even the whole swap_map cleared for swapoff. 3100 */ 3101 goto outer; 3102 } 3103 3104 offset = swp_offset(entry); 3105 3106 ci = lock_cluster(si, offset); 3107 3108 count = si->swap_map[offset] & ~SWAP_HAS_CACHE; 3109 3110 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3111 /* 3112 * The higher the swap count, the more likely it is that tasks 3113 * will race to add swap count continuation: we need to avoid 3114 * over-provisioning. 3115 */ 3116 goto out; 3117 } 3118 3119 if (!page) { 3120 unlock_cluster(ci); 3121 spin_unlock(&si->lock); 3122 return -ENOMEM; 3123 } 3124 3125 /* 3126 * We are fortunate that although vmalloc_to_page uses pte_offset_map, 3127 * no architecture is using highmem pages for kernel page tables: so it 3128 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. 3129 */ 3130 head = vmalloc_to_page(si->swap_map + offset); 3131 offset &= ~PAGE_MASK; 3132 3133 /* 3134 * Page allocation does not initialize the page's lru field, 3135 * but it does always reset its private field. 3136 */ 3137 if (!page_private(head)) { 3138 BUG_ON(count & COUNT_CONTINUED); 3139 INIT_LIST_HEAD(&head->lru); 3140 set_page_private(head, SWP_CONTINUED); 3141 si->flags |= SWP_CONTINUED; 3142 } 3143 3144 list_for_each_entry(list_page, &head->lru, lru) { 3145 unsigned char *map; 3146 3147 /* 3148 * If the previous map said no continuation, but we've found 3149 * a continuation page, free our allocation and use this one. 3150 */ 3151 if (!(count & COUNT_CONTINUED)) 3152 goto out; 3153 3154 map = kmap_atomic(list_page) + offset; 3155 count = *map; 3156 kunmap_atomic(map); 3157 3158 /* 3159 * If this continuation count now has some space in it, 3160 * free our allocation and use this one. 
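 * ("Has some space" means the digit stored here is still below
 * SWAP_CONT_MAX, which is exactly what the test below checks.)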
3161 */ 3162 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3163 goto out; 3164 } 3165 3166 list_add_tail(&page->lru, &head->lru); 3167 page = NULL; /* now it's attached, don't free it */ 3168 out: 3169 unlock_cluster(ci); 3170 spin_unlock(&si->lock); 3171 outer: 3172 if (page) 3173 __free_page(page); 3174 return 0; 3175 } 3176 3177 /* 3178 * swap_count_continued - when the original swap_map count is incremented 3179 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3180 * into, carry if so, or else fail until a new continuation page is allocated; 3181 * when the original swap_map count is decremented from 0 with continuation, 3182 * borrow from the continuation and report whether it still holds more. 3183 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 3184 * lock. 3185 */ 3186 static bool swap_count_continued(struct swap_info_struct *si, 3187 pgoff_t offset, unsigned char count) 3188 { 3189 struct page *head; 3190 struct page *page; 3191 unsigned char *map; 3192 3193 head = vmalloc_to_page(si->swap_map + offset); 3194 if (page_private(head) != SWP_CONTINUED) { 3195 BUG_ON(count & COUNT_CONTINUED); 3196 return false; /* need to add count continuation */ 3197 } 3198 3199 offset &= ~PAGE_MASK; 3200 page = list_entry(head->lru.next, struct page, lru); 3201 map = kmap_atomic(page) + offset; 3202 3203 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3204 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3205 3206 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3207 /* 3208 * Think of how you add 1 to 999 3209 */ 3210 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3211 kunmap_atomic(map); 3212 page = list_entry(page->lru.next, struct page, lru); 3213 BUG_ON(page == head); 3214 map = kmap_atomic(page) + offset; 3215 } 3216 if (*map == SWAP_CONT_MAX) { 3217 kunmap_atomic(map); 3218 page = list_entry(page->lru.next, struct page, lru); 3219 if (page == head) 3220 return false; /* add count continuation */ 3221 map = kmap_atomic(page) + offset; 3222 init_map: *map = 0; /* we didn't zero the page */ 3223 } 3224 *map += 1; 3225 kunmap_atomic(map); 3226 page = list_entry(page->lru.prev, struct page, lru); 3227 while (page != head) { 3228 map = kmap_atomic(page) + offset; 3229 *map = COUNT_CONTINUED; 3230 kunmap_atomic(map); 3231 page = list_entry(page->lru.prev, struct page, lru); 3232 } 3233 return true; /* incremented */ 3234 3235 } else { /* decrementing */ 3236 /* 3237 * Think of how you subtract 1 from 1000 3238 */ 3239 BUG_ON(count != COUNT_CONTINUED); 3240 while (*map == COUNT_CONTINUED) { 3241 kunmap_atomic(map); 3242 page = list_entry(page->lru.next, struct page, lru); 3243 BUG_ON(page == head); 3244 map = kmap_atomic(page) + offset; 3245 } 3246 BUG_ON(*map == 0); 3247 *map -= 1; 3248 if (*map == 0) 3249 count = 0; 3250 kunmap_atomic(map); 3251 page = list_entry(page->lru.prev, struct page, lru); 3252 while (page != head) { 3253 map = kmap_atomic(page) + offset; 3254 *map = SWAP_CONT_MAX | count; 3255 count = COUNT_CONTINUED; 3256 kunmap_atomic(map); 3257 page = list_entry(page->lru.prev, struct page, lru); 3258 } 3259 return count == COUNT_CONTINUED; 3260 } 3261 } 3262 3263 /* 3264 * free_swap_count_continuations - swapoff free all the continuation pages 3265 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 
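 *
 * The loop below advances offset by PAGE_SIZE because each page of the
 * vmalloc'ed swap_map holds PAGE_SIZE one-byte counts, so
 * vmalloc_to_page() lands on each head page exactly once.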
3266 */ 3267 static void free_swap_count_continuations(struct swap_info_struct *si) 3268 { 3269 pgoff_t offset; 3270 3271 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3272 struct page *head; 3273 head = vmalloc_to_page(si->swap_map + offset); 3274 if (page_private(head)) { 3275 struct page *page, *next; 3276 3277 list_for_each_entry_safe(page, next, &head->lru, lru) { 3278 list_del(&page->lru); 3279 __free_page(page); 3280 } 3281 } 3282 } 3283 } 3284
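/*
 * Illustrative user-space sketch of how the flag handling in sys_swapon()
 * above is typically exercised, assuming the SWAP_FLAG_* values from the
 * kernel header are available to the caller.  Explicit priority 5, with
 * only per-cluster ("pages") discard:
 *
 *	swapon("/dev/vdb2", SWAP_FLAG_PREFER |
 *			    (5 << SWAP_FLAG_PRIO_SHIFT) |
 *			    SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES);
 *	...
 *	swapoff("/dev/vdb2");
 */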