/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);

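/* One entry per swap area, indexed by swap type (swp_type()). */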
struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
}

/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c when reclaiming pages.  So we hold a lock on a page
	 * here.  We have to use trylock to avoid deadlock.  This is a special
	 * case and you should use try_to_free_swap() with explicit lock_page()
	 * in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = &si->first_swap_extent;
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	list_for_each_entry(se, &si->first_swap_extent.list, list) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
				    nr_blocks, GFP_NOIO, 0))
				break;
		}

		se = list_next_entry(se, list);
	}
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256

static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

static inline struct swap_cluster_info *lock_cluster_or_swap_info(
	struct swap_info_struct *si,
	unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

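/*
 * A swap_cluster_list is threaded through the cluster_info array itself:
 * head and tail hold cluster indices, and each linked cluster stores the
 * index of the next one in its "data" field (see cluster_set_next()).
 */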
static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired when we held swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied). They
	 * will be cleared after the discard.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

/*
 * Actually do the discards.  After a cluster discard is finished, the cluster
 * will be added to the free cluster list.  The caller should hold si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		cluster_set_flag(ci, CLUSTER_FLAG_FREE);
		unlock_cluster(ci);
		cluster_list_add_tail(&si->free_clusters, info, idx);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from the free cluster list and its usage counter will be increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx])) {
		VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx);
		cluster_list_del_first(&p->free_clusters, cluster_info);
		cluster_set_count_flag(&cluster_info[idx], 0, 0);
	}

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr loses one usage.  If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to the free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);

	if (cluster_count(&cluster_info[idx]) == 0) {
		/*
		 * If the swap is discardable, prepare to discard the cluster
		 * instead of freeing it immediately. The cluster will be freed
		 * after the discard.
		 */
		if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
				 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
			swap_cluster_schedule_discard(p, idx);
			return;
		}

		cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
		cluster_list_add_tail(&p->free_clusters, cluster_info, idx);
	}
}

/*
 * It's possible for scan_swap_map() to use a free cluster in the middle of
 * the free cluster list.  Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

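/*
 * Per-CPU allocation state: percpu_cluster->index names the cluster this
 * CPU is currently allocating from and percpu_cluster->next is the next
 * swap offset to try inside it, so different CPUs tend to fill different
 * clusters instead of contending for the same one.
 */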
/*
 * Try to get a swap entry from the current cpu's swap entry pool (a cluster).
 * This might involve allocating a new cluster for the current CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	bool found_free;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have a free cluster, but some clusters are
			 * being discarded; do the discard now and reclaim them
			 */
			swap_do_scheduled_discard(si);
			*scan_base = *offset = si->cluster_next;
			goto new_cluster;
		} else
			return false;
	}

	found_free = false;

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster;
	 * check if there is still a free entry in the cluster
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	ci = lock_cluster(si, tmp);
	while (tmp < max) {
		if (!si->swap_map[tmp]) {
			found_free = true;
			break;
		}
		tmp++;
	}
	unlock_cluster(ci);
	if (!found_free) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return found_free;
}

static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;

	if (nr > SWAP_BATCH)
		nr = SWAP_BATCH;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto scan;
	}

	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							&scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	si->swap_map[offset] = usage;
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		spin_lock(&swap_avail_lock);
		plist_del(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
	}
	si->cluster_next = offset + 1;
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto done;
	}
	/* non-ssd case */
	++offset;

	/* non-ssd case, still more slots in cluster? */
	if (si->cluster_nr && !si->swap_map[offset]) {
		--si->cluster_nr;
		goto checks;
	}

done:
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}

static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;

}

int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
{
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;

	avail_pgs = atomic_long_read(&nr_swap_pages);
	if (avail_pgs <= 0)
		goto noswap;

	if (n_goal > SWAP_BATCH)
		n_goal = SWAP_BATCH;

	if (n_goal > avail_pgs)
		n_goal = avail_pgs;

	atomic_long_sub(n_goal, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_list)) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			plist_del(&si->avail_list, &swap_avail_head);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
					    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map() can drop the si->lock, multiple
		 * callers probably all tried to get a page from the same si
		 * and it filled up before we could get one; or, the si filled
		 * up between us dropping swap_avail_lock and taking si->lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over
		 * if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_list))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
noswap:
	return n_ret;
}

/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	si = swap_info[type];
	spin_lock(&si->lock);
	if (si && (si->flags & SWP_WRITEOK)) {
		atomic_long_dec(&nr_swap_pages);
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&si->lock);
			return swp_entry(type, offset);
		}
		atomic_long_inc(&nr_swap_pages);
	}
	spin_unlock(&si->lock);
	return (swp_entry_t) {0};
}

static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (!p->swap_map[swp_offset(entry)])
		goto bad_free;
	return p;

bad_free:
	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
	goto out;
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}

static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
					struct swap_info_struct *q)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);

	if (p != q) {
		if (q != NULL)
			spin_unlock(&q->lock);
		if (p != NULL)
			spin_lock(&p->lock);
	}
	return p;
}

static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry, unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;
	unsigned char has_cache;

	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

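	/*
	 * If all references are gone (usage == 0), leave SWAP_HAS_CACHE set
	 * so the slot is not reallocated before swap_entry_free() does the
	 * final accounting via the swap slots cache (see free_swap_slot()).
	 */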
	usage = count | has_cache;
	p->swap_map[offset] = usage ? usage : SWAP_HAS_CACHE;

	unlock_cluster_or_swap_info(p, ci);

	return usage;
}

static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry);
	if (offset < p->lowest_bit)
		p->lowest_bit = offset;
	if (offset > p->highest_bit) {
		bool was_full = !p->highest_bit;

		p->highest_bit = offset;
		if (was_full && (p->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			WARN_ON(!plist_node_empty(&p->avail_list));
			if (plist_node_empty(&p->avail_list))
				plist_add(&p->avail_list,
					  &swap_avail_head);
			spin_unlock(&swap_avail_lock);
		}
	}
	atomic_long_inc(&nr_swap_pages);
	p->inuse_pages--;
	frontswap_invalidate_page(p->type, offset);
	if (p->flags & SWP_BLKDEV) {
		struct gendisk *disk = p->bdev->bd_disk;

		if (disk->fops->swap_slot_free_notify)
			disk->fops->swap_slot_free_notify(p->bdev,
							  offset);
	}
}

/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p) {
		if (!__swap_entry_free(p, entry, 1))
			free_swap_slot(entry);
	}
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
void swapcache_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p) {
		if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
			free_swap_slot(entry);
	}
}

void swapcache_free_entries(swp_entry_t *entries, int n)
{
	struct swap_info_struct *p, *prev;
	int i;

	if (n <= 0)
		return;

	prev = NULL;
	p = NULL;
	for (i = 0; i < n; ++i) {
		p = swap_info_get_cont(entries[i], prev);
		if (p)
			swap_entry_free(p, entries[i]);
		prev = p;
	}
	if (p)
		spin_unlock(&p->lock);
}

/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	swp_entry_t entry;
	unsigned long offset;

	entry.val = page_private(page);
	p = _swap_info_get(entry);
	if (p) {
		offset = swp_offset(entry);
		ci = lock_cluster_or_swap_info(p, offset);
		count = swap_count(p->swap_map[offset]);
		unlock_cluster_or_swap_info(p, ci);
	}
	return count;
}

static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
	int count = 0;
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci;

	ci = lock_cluster_or_swap_info(si, offset);
	count = swap_count(si->swap_map[offset]);
	unlock_cluster_or_swap_info(si, ci);
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int __swp_swapcount(swp_entry_t entry)
{
	int count = 0;
	struct swap_info_struct *si;

	si = __swap_info_get(entry);
	if (si)
		count = swap_swapcount(si, entry);
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns the exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}

/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * NOTE: total_mapcount should not be relied upon by the caller if
 * reuse_swap_page() returns false, but it may always be overwritten
 * (see the other implementation for CONFIG_SWAP=n).
 */
bool reuse_swap_page(struct page *page, int *total_mapcount)
{
	int count;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (unlikely(PageKsm(page)))
		return false;
	count = page_trans_huge_mapcount(page, total_mapcount);
	if (count <= 1 && PageSwapCache(page)) {
		count += page_swapcount(page);
		if (count != 1)
			goto out;
		if (!PageWriteback(page)) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		} else {
			swp_entry_t entry;
			struct swap_info_struct *p;

			entry.val = page_private(page);
			p = swap_info_get(entry);
			if (p->flags & SWP_STABLE_WRITES) {
				spin_unlock(&p->lock);
				return false;
			}
			spin_unlock(&p->lock);
		}
	}
out:
	return count <= 1;
}

/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapcount(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}

/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	struct page *page = NULL;
	unsigned char count;

	if (non_swap_entry(entry))
		return 1;

	p = _swap_info_get(entry);
	if (p) {
		count = __swap_entry_free(p, entry, 1);
		if (count == SWAP_HAS_CACHE) {
			page = find_get_page(swap_address_space(entry),
					     swp_offset(entry));
			if (page && !trylock_page(page)) {
				put_page(page);
				page = NULL;
			}
		} else if (!count)
			free_swap_slot(entry);
	}
	if (page) {
		/*
		 * Not mapped elsewhere, or swap space full? Free it!
		 * Also recheck PageSwapCache now page is locked (above).
		 */
		if (PageSwapCache(page) && !PageWriteback(page) &&
		    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
		    !swap_swapcount(p, entry)) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		put_page(page);
	}
	return p != NULL;
}

#ifdef CONFIG_HIBERNATION
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
	struct block_device *bdev = NULL;
	int type;

	if (device)
		bdev = bdget(device);

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *sis = swap_info[type];

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (!bdev) {
			if (bdev_p)
				*bdev_p = bdgrab(sis->bdev);

			spin_unlock(&swap_lock);
			return type;
		}
		if (bdev == sis->bdev) {
			struct swap_extent *se = &sis->first_swap_extent;

			if (se->start_block == offset) {
				if (bdev_p)
					*bdev_p = bdgrab(sis->bdev);

				spin_unlock(&swap_lock);
				bdput(bdev);
				return type;
			}
		}
	}
	spin_unlock(&swap_lock);
	if (bdev)
		bdput(bdev);

	return -ENODEV;
}

/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
	struct block_device *bdev;

	if ((unsigned int)type >= nr_swapfiles)
		return 0;
	if (!(swap_info[type]->flags & SWP_WRITEOK))
		return 0;
	return map_swap_entry(swp_entry(type, offset), &bdev);
}

/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	spin_lock(&swap_lock);
	if ((unsigned int)type < nr_swapfiles) {
		struct swap_info_struct *sis = swap_info[type];

		spin_lock(&sis->lock);
		if (sis->flags & SWP_WRITEOK) {
			n = sis->pages;
			if (free)
				n -= sis->inuse_pages;
		}
		spin_unlock(&sis->lock);
	}
	spin_unlock(&swap_lock);
	return n;
}
#endif /* CONFIG_HIBERNATION */

static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}

/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct page *swapcache;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	swapcache = page;
	page = ksm_might_need_to_copy(page, vma, addr);
	if (unlikely(!page))
		return -ENOMEM;

	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				&memcg, false)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
		mem_cgroup_cancel_charge(page, memcg, false);
		ret = 0;
		goto out;
	}

	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	if (page == swapcache) {
		page_add_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, true, false);
	} else { /* ksm created a completely new copy */
		page_add_new_anon_rmap(page, vma, addr, false);
		mem_cgroup_commit_charge(page, memcg, false, false);
		lru_cache_add_active_or_unevictable(page, vma);
	}
	swap_free(entry);
	/*
	 * Move the page to the active list so it is not
	 * immediately swapped out again after swapon.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	if (page != swapcache) {
		unlock_page(page);
		put_page(page);
	}
	return ret;
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * We don't actually need pte lock while scanning for swp_pte: since
	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
	 * page table while we're scanning; though it could get zapped, and on
	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
	 * of unmatched parts which look like swp_pte, so unuse_pte must
	 * recheck under pte lock.  Scanning without pte lock lets it be
	 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);
out:
	return ret;
}

static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		cond_resched();
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	p4d_t *p4d;
	unsigned long next;
	int ret;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int unuse_vma(struct vm_area_struct *vma,
				swp_entry_t entry, struct page *page)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	if (page_anon_vma(page)) {
		addr = page_address_in_vma(page, vma);
		if (addr == -EFAULT)
			return 0;
		else
			end = addr + PAGE_SIZE;
	} else {
		addr = vma->vm_start;
		end = vma->vm_end;
	}

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;
	int ret = 0;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_inactive_list is unlikely to unmap
		 * its ptes while lock is dropped, so swapoff can make progress.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
			break;
		cond_resched();
	}
	up_read(&mm->mmap_sem);
	return (ret < 0)? ret: 0;
}

/*
 * Scan swap_map (or frontswap_map if frontswap parameter is true)
 * from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev, bool frontswap)
{
	unsigned int max = si->max;
	unsigned int i = prev;
	unsigned char count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			if (!frontswap || frontswap_test(si, i))
				break;
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}
	return i;
}

/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 *
 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
 * pages_to_unuse==0 means all pages; ignored if frontswap is false
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	volatile unsigned char *swap_map; /* swap_map is accessed without
					   * locking. Mark it as volatile
					   * to prevent compiler doing
					   * something odd.
					   */
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.
	 */
	start_mm = &init_mm;
	mmget(&init_mm);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			swcount = *swap_map;
			/*
			 * We don't hold a lock here, so the swap entry could be
			 * SWAP_MAP_BAD (when the cluster is being discarded).
			 * Instead of failing out, we can just skip the swap
			 * entry because swapoff will wait for the discard to
			 * finish anyway.
			 */
			if (!swcount || swcount == SWAP_MAP_BAD)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			mmget(&init_mm);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/* page has already been unlocked and released */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			mmget(new_start_mm);
			mmget(prev_mm);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!mmget_not_zero(mm))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					mmget(mm);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			put_page(page);
			break;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Given how unuse_vma() targets one particular offset
		 * in an anon_vma, once the anon_vma has been determined,
		 * this splitting happens to be just what is needed to
		 * handle where KSM pages have been swapped out: re-reading
		 * is unnecessarily slow, but we can fix that later on.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_page_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		put_page(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
	struct list_head *p, *next;
	unsigned int type;

	for (type = 0; type < nr_swapfiles; type++)
		if (swap_info[type]->inuse_pages)
			return;
	spin_lock(&mmlist_lock);
	list_for_each_safe(p, next, &init_mm.mmlist)
		list_del_init(p);
	spin_unlock(&mmlist_lock);
}

/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset for the specified swap entry.
 * Note that the type of this function is sector_t, but it returns page offset
 * into the bdev, not sector offset.
 */
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
	struct swap_info_struct *sis;
	struct swap_extent *start_se;
	struct swap_extent *se;
	pgoff_t offset;

	sis = swap_info[swp_type(entry)];
	*bdev = sis->bdev;

	offset = swp_offset(entry);
	start_se = sis->curr_swap_extent;
	se = start_se;

	for ( ; ; ) {
		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		se = list_next_entry(se, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}

/*
 * Returns the page offset into bdev for the specified page's swap entry.
 */
sector_t map_swap_page(struct page *page, struct block_device **bdev)
{
	swp_entry_t entry;
	entry.val = page_private(page);
	return map_swap_entry(entry, bdev);
}

/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->first_swap_extent.list)) {
		struct swap_extent *se;

		se = list_first_entry(&sis->first_swap_extent.list,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}

	if (sis->flags & SWP_FILE) {
		struct file *swap_file = sis->swap_file;
		struct address_space *mapping = swap_file->f_mapping;

		sis->flags &= ~SWP_FILE;
		mapping->a_ops->swap_deactivate(swap_file);
	}
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in page order.
 *
 * This function rather assumes that it is called in ascending page order.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	if (start_page == 0) {
		se = &sis->first_swap_extent;
		sis->curr_swap_extent = se;
		se->start_page = 0;
		se->nr_pages = nr_pages;
		se->start_block = start_block;
		return 1;
	} else {
		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
		se = list_entry(lh, struct swap_extent, list);
		BUG_ON(se->start_page + se->nr_pages != start_page);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
	return 1;
}

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
 * which will scribble on the fs.
2046 * 2047 * The amount of disk space which a single swap extent represents varies. 2048 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2049 * extents in the list. To avoid much list walking, we cache the previous 2050 * search location in `curr_swap_extent', and start new searches from there. 2051 * This is extremely effective. The average number of iterations in 2052 * map_swap_page() has been measured at about 0.3 per page. - akpm. 2053 */ 2054 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2055 { 2056 struct file *swap_file = sis->swap_file; 2057 struct address_space *mapping = swap_file->f_mapping; 2058 struct inode *inode = mapping->host; 2059 int ret; 2060 2061 if (S_ISBLK(inode->i_mode)) { 2062 ret = add_swap_extent(sis, 0, sis->max, 0); 2063 *span = sis->pages; 2064 return ret; 2065 } 2066 2067 if (mapping->a_ops->swap_activate) { 2068 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2069 if (!ret) { 2070 sis->flags |= SWP_FILE; 2071 ret = add_swap_extent(sis, 0, sis->max, 0); 2072 *span = sis->pages; 2073 } 2074 return ret; 2075 } 2076 2077 return generic_swapfile_activate(sis, swap_file, span); 2078 } 2079 2080 static void _enable_swap_info(struct swap_info_struct *p, int prio, 2081 unsigned char *swap_map, 2082 struct swap_cluster_info *cluster_info) 2083 { 2084 if (prio >= 0) 2085 p->prio = prio; 2086 else 2087 p->prio = --least_priority; 2088 /* 2089 * the plist prio is negated because plist ordering is 2090 * low-to-high, while swap ordering is high-to-low 2091 */ 2092 p->list.prio = -p->prio; 2093 p->avail_list.prio = -p->prio; 2094 p->swap_map = swap_map; 2095 p->cluster_info = cluster_info; 2096 p->flags |= SWP_WRITEOK; 2097 atomic_long_add(p->pages, &nr_swap_pages); 2098 total_swap_pages += p->pages; 2099 2100 assert_spin_locked(&swap_lock); 2101 /* 2102 * both lists are plists, and thus priority ordered. 2103 * swap_active_head needs to be priority ordered for swapoff(), 2104 * which on removal of any swap_info_struct with an auto-assigned 2105 * (i.e. negative) priority increments the auto-assigned priority 2106 * of any lower-priority swap_info_structs. 2107 * swap_avail_head needs to be priority ordered for get_swap_page(), 2108 * which allocates swap pages from the highest available priority 2109 * swap_info_struct. 
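 *
 * For example (illustrative numbers): an area enabled with prio 5 gets
 * plist prio -5, while an auto-prioritised area at prio -2 gets plist
 * prio 2; -5 sorts first, so get_swap_page() prefers the explicitly
 * prioritised area, matching the high-to-low swap ordering.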
2110 */ 2111 plist_add(&p->list, &swap_active_head); 2112 spin_lock(&swap_avail_lock); 2113 plist_add(&p->avail_list, &swap_avail_head); 2114 spin_unlock(&swap_avail_lock); 2115 } 2116 2117 static void enable_swap_info(struct swap_info_struct *p, int prio, 2118 unsigned char *swap_map, 2119 struct swap_cluster_info *cluster_info, 2120 unsigned long *frontswap_map) 2121 { 2122 frontswap_init(p->type, frontswap_map); 2123 spin_lock(&swap_lock); 2124 spin_lock(&p->lock); 2125 _enable_swap_info(p, prio, swap_map, cluster_info); 2126 spin_unlock(&p->lock); 2127 spin_unlock(&swap_lock); 2128 } 2129 2130 static void reinsert_swap_info(struct swap_info_struct *p) 2131 { 2132 spin_lock(&swap_lock); 2133 spin_lock(&p->lock); 2134 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); 2135 spin_unlock(&p->lock); 2136 spin_unlock(&swap_lock); 2137 } 2138 2139 bool has_usable_swap(void) 2140 { 2141 bool ret = true; 2142 2143 spin_lock(&swap_lock); 2144 if (plist_head_empty(&swap_active_head)) 2145 ret = false; 2146 spin_unlock(&swap_lock); 2147 return ret; 2148 } 2149 2150 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2151 { 2152 struct swap_info_struct *p = NULL; 2153 unsigned char *swap_map; 2154 struct swap_cluster_info *cluster_info; 2155 unsigned long *frontswap_map; 2156 struct file *swap_file, *victim; 2157 struct address_space *mapping; 2158 struct inode *inode; 2159 struct filename *pathname; 2160 int err, found = 0; 2161 unsigned int old_block_size; 2162 2163 if (!capable(CAP_SYS_ADMIN)) 2164 return -EPERM; 2165 2166 BUG_ON(!current->mm); 2167 2168 pathname = getname(specialfile); 2169 if (IS_ERR(pathname)) 2170 return PTR_ERR(pathname); 2171 2172 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2173 err = PTR_ERR(victim); 2174 if (IS_ERR(victim)) 2175 goto out; 2176 2177 mapping = victim->f_mapping; 2178 spin_lock(&swap_lock); 2179 plist_for_each_entry(p, &swap_active_head, list) { 2180 if (p->flags & SWP_WRITEOK) { 2181 if (p->swap_file->f_mapping == mapping) { 2182 found = 1; 2183 break; 2184 } 2185 } 2186 } 2187 if (!found) { 2188 err = -EINVAL; 2189 spin_unlock(&swap_lock); 2190 goto out_dput; 2191 } 2192 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2193 vm_unacct_memory(p->pages); 2194 else { 2195 err = -ENOMEM; 2196 spin_unlock(&swap_lock); 2197 goto out_dput; 2198 } 2199 spin_lock(&swap_avail_lock); 2200 plist_del(&p->avail_list, &swap_avail_head); 2201 spin_unlock(&swap_avail_lock); 2202 spin_lock(&p->lock); 2203 if (p->prio < 0) { 2204 struct swap_info_struct *si = p; 2205 2206 plist_for_each_entry_continue(si, &swap_active_head, list) { 2207 si->prio++; 2208 si->list.prio--; 2209 si->avail_list.prio--; 2210 } 2211 least_priority++; 2212 } 2213 plist_del(&p->list, &swap_active_head); 2214 atomic_long_sub(p->pages, &nr_swap_pages); 2215 total_swap_pages -= p->pages; 2216 p->flags &= ~SWP_WRITEOK; 2217 spin_unlock(&p->lock); 2218 spin_unlock(&swap_lock); 2219 2220 disable_swap_slots_cache_lock(); 2221 2222 set_current_oom_origin(); 2223 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ 2224 clear_current_oom_origin(); 2225 2226 if (err) { 2227 /* re-insert swap space back into swap_list */ 2228 reinsert_swap_info(p); 2229 reenable_swap_slots_cache_unlock(); 2230 goto out_dput; 2231 } 2232 2233 reenable_swap_slots_cache_unlock(); 2234 2235 flush_work(&p->discard_work); 2236 2237 destroy_swap_extents(p); 2238 if (p->flags & SWP_CONTINUED) 2239 free_swap_count_continuations(p); 2240 2241 mutex_lock(&swapon_mutex); 2242 
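	/*
	 * Final tear-down: under swapon_mutex, swap_lock and p->lock we
	 * wait out any lingering scan_swap_map() users, then detach the
	 * swap_map, cluster_info and frontswap map so they can be freed
	 * once the locks are dropped.
	 */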
spin_lock(&swap_lock); 2243 spin_lock(&p->lock); 2244 drain_mmlist(); 2245 2246 /* wait for anyone still in scan_swap_map */ 2247 p->highest_bit = 0; /* cuts scans short */ 2248 while (p->flags >= SWP_SCANNING) { 2249 spin_unlock(&p->lock); 2250 spin_unlock(&swap_lock); 2251 schedule_timeout_uninterruptible(1); 2252 spin_lock(&swap_lock); 2253 spin_lock(&p->lock); 2254 } 2255 2256 swap_file = p->swap_file; 2257 old_block_size = p->old_block_size; 2258 p->swap_file = NULL; 2259 p->max = 0; 2260 swap_map = p->swap_map; 2261 p->swap_map = NULL; 2262 cluster_info = p->cluster_info; 2263 p->cluster_info = NULL; 2264 frontswap_map = frontswap_map_get(p); 2265 spin_unlock(&p->lock); 2266 spin_unlock(&swap_lock); 2267 frontswap_invalidate_area(p->type); 2268 frontswap_map_set(p, NULL); 2269 mutex_unlock(&swapon_mutex); 2270 free_percpu(p->percpu_cluster); 2271 p->percpu_cluster = NULL; 2272 vfree(swap_map); 2273 kvfree(cluster_info); 2274 kvfree(frontswap_map); 2275 /* Destroy swap account information */ 2276 swap_cgroup_swapoff(p->type); 2277 exit_swap_address_space(p->type); 2278 2279 inode = mapping->host; 2280 if (S_ISBLK(inode->i_mode)) { 2281 struct block_device *bdev = I_BDEV(inode); 2282 set_blocksize(bdev, old_block_size); 2283 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2284 } else { 2285 inode_lock(inode); 2286 inode->i_flags &= ~S_SWAPFILE; 2287 inode_unlock(inode); 2288 } 2289 filp_close(swap_file, NULL); 2290 2291 /* 2292 * Clear the SWP_USED flag after all resources are freed so that swapon 2293 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2294 * not hold p->lock after we cleared its SWP_WRITEOK. 2295 */ 2296 spin_lock(&swap_lock); 2297 p->flags = 0; 2298 spin_unlock(&swap_lock); 2299 2300 err = 0; 2301 atomic_inc(&proc_poll_event); 2302 wake_up_interruptible(&proc_poll_wait); 2303 2304 out_dput: 2305 filp_close(victim, NULL); 2306 out: 2307 putname(pathname); 2308 return err; 2309 } 2310 2311 #ifdef CONFIG_PROC_FS 2312 static unsigned swaps_poll(struct file *file, poll_table *wait) 2313 { 2314 struct seq_file *seq = file->private_data; 2315 2316 poll_wait(file, &proc_poll_wait, wait); 2317 2318 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2319 seq->poll_event = atomic_read(&proc_poll_event); 2320 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 2321 } 2322 2323 return POLLIN | POLLRDNORM; 2324 } 2325 2326 /* iterator */ 2327 static void *swap_start(struct seq_file *swap, loff_t *pos) 2328 { 2329 struct swap_info_struct *si; 2330 int type; 2331 loff_t l = *pos; 2332 2333 mutex_lock(&swapon_mutex); 2334 2335 if (!l) 2336 return SEQ_START_TOKEN; 2337 2338 for (type = 0; type < nr_swapfiles; type++) { 2339 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2340 si = swap_info[type]; 2341 if (!(si->flags & SWP_USED) || !si->swap_map) 2342 continue; 2343 if (!--l) 2344 return si; 2345 } 2346 2347 return NULL; 2348 } 2349 2350 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2351 { 2352 struct swap_info_struct *si = v; 2353 int type; 2354 2355 if (v == SEQ_START_TOKEN) 2356 type = 0; 2357 else 2358 type = si->type + 1; 2359 2360 for (; type < nr_swapfiles; type++) { 2361 smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2362 si = swap_info[type]; 2363 if (!(si->flags & SWP_USED) || !si->swap_map) 2364 continue; 2365 ++*pos; 2366 return si; 2367 } 2368 2369 return NULL; 2370 } 2371 2372 static void swap_stop(struct seq_file *swap, void *v) 2373 { 2374 mutex_unlock(&swapon_mutex); 2375 } 2376 2377 static int 
swap_show(struct seq_file *swap, void *v) 2378 { 2379 struct swap_info_struct *si = v; 2380 struct file *file; 2381 int len; 2382 2383 if (si == SEQ_START_TOKEN) { 2384 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 2385 return 0; 2386 } 2387 2388 file = si->swap_file; 2389 len = seq_file_path(swap, file, " \t\n\\"); 2390 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 2391 len < 40 ? 40 - len : 1, " ", 2392 S_ISBLK(file_inode(file)->i_mode) ? 2393 "partition" : "file\t", 2394 si->pages << (PAGE_SHIFT - 10), 2395 si->inuse_pages << (PAGE_SHIFT - 10), 2396 si->prio); 2397 return 0; 2398 } 2399 2400 static const struct seq_operations swaps_op = { 2401 .start = swap_start, 2402 .next = swap_next, 2403 .stop = swap_stop, 2404 .show = swap_show 2405 }; 2406 2407 static int swaps_open(struct inode *inode, struct file *file) 2408 { 2409 struct seq_file *seq; 2410 int ret; 2411 2412 ret = seq_open(file, &swaps_op); 2413 if (ret) 2414 return ret; 2415 2416 seq = file->private_data; 2417 seq->poll_event = atomic_read(&proc_poll_event); 2418 return 0; 2419 } 2420 2421 static const struct file_operations proc_swaps_operations = { 2422 .open = swaps_open, 2423 .read = seq_read, 2424 .llseek = seq_lseek, 2425 .release = seq_release, 2426 .poll = swaps_poll, 2427 }; 2428 2429 static int __init procswaps_init(void) 2430 { 2431 proc_create("swaps", 0, NULL, &proc_swaps_operations); 2432 return 0; 2433 } 2434 __initcall(procswaps_init); 2435 #endif /* CONFIG_PROC_FS */ 2436 2437 #ifdef MAX_SWAPFILES_CHECK 2438 static int __init max_swapfiles_check(void) 2439 { 2440 MAX_SWAPFILES_CHECK(); 2441 return 0; 2442 } 2443 late_initcall(max_swapfiles_check); 2444 #endif 2445 2446 static struct swap_info_struct *alloc_swap_info(void) 2447 { 2448 struct swap_info_struct *p; 2449 unsigned int type; 2450 2451 p = kzalloc(sizeof(*p), GFP_KERNEL); 2452 if (!p) 2453 return ERR_PTR(-ENOMEM); 2454 2455 spin_lock(&swap_lock); 2456 for (type = 0; type < nr_swapfiles; type++) { 2457 if (!(swap_info[type]->flags & SWP_USED)) 2458 break; 2459 } 2460 if (type >= MAX_SWAPFILES) { 2461 spin_unlock(&swap_lock); 2462 kfree(p); 2463 return ERR_PTR(-EPERM); 2464 } 2465 if (type >= nr_swapfiles) { 2466 p->type = type; 2467 swap_info[type] = p; 2468 /* 2469 * Write swap_info[type] before nr_swapfiles, in case a 2470 * racing procfs swap_start() or swap_next() is reading them. 2471 * (We never shrink nr_swapfiles, we never free this entry.) 2472 */ 2473 smp_wmb(); 2474 nr_swapfiles++; 2475 } else { 2476 kfree(p); 2477 p = swap_info[type]; 2478 /* 2479 * Do not memset this entry: a racing procfs swap_next() 2480 * would be relying on p->type to remain valid. 
2481 */ 2482 } 2483 INIT_LIST_HEAD(&p->first_swap_extent.list); 2484 plist_node_init(&p->list, 0); 2485 plist_node_init(&p->avail_list, 0); 2486 p->flags = SWP_USED; 2487 spin_unlock(&swap_lock); 2488 spin_lock_init(&p->lock); 2489 2490 return p; 2491 } 2492 2493 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) 2494 { 2495 int error; 2496 2497 if (S_ISBLK(inode->i_mode)) { 2498 p->bdev = bdgrab(I_BDEV(inode)); 2499 error = blkdev_get(p->bdev, 2500 FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); 2501 if (error < 0) { 2502 p->bdev = NULL; 2503 return error; 2504 } 2505 p->old_block_size = block_size(p->bdev); 2506 error = set_blocksize(p->bdev, PAGE_SIZE); 2507 if (error < 0) 2508 return error; 2509 p->flags |= SWP_BLKDEV; 2510 } else if (S_ISREG(inode->i_mode)) { 2511 p->bdev = inode->i_sb->s_bdev; 2512 inode_lock(inode); 2513 if (IS_SWAPFILE(inode)) 2514 return -EBUSY; 2515 } else 2516 return -EINVAL; 2517 2518 return 0; 2519 } 2520 2521 static unsigned long read_swap_header(struct swap_info_struct *p, 2522 union swap_header *swap_header, 2523 struct inode *inode) 2524 { 2525 int i; 2526 unsigned long maxpages; 2527 unsigned long swapfilepages; 2528 unsigned long last_page; 2529 2530 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 2531 pr_err("Unable to find swap-space signature\n"); 2532 return 0; 2533 } 2534 2535 /* swap partition endianess hack... */ 2536 if (swab32(swap_header->info.version) == 1) { 2537 swab32s(&swap_header->info.version); 2538 swab32s(&swap_header->info.last_page); 2539 swab32s(&swap_header->info.nr_badpages); 2540 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 2541 return 0; 2542 for (i = 0; i < swap_header->info.nr_badpages; i++) 2543 swab32s(&swap_header->info.badpages[i]); 2544 } 2545 /* Check the swap header's sub-version */ 2546 if (swap_header->info.version != 1) { 2547 pr_warn("Unable to handle swap header version %d\n", 2548 swap_header->info.version); 2549 return 0; 2550 } 2551 2552 p->lowest_bit = 1; 2553 p->cluster_next = 1; 2554 p->cluster_nr = 0; 2555 2556 /* 2557 * Find out how many pages are allowed for a single swap 2558 * device. There are two limiting factors: 1) the number 2559 * of bits for the swap offset in the swp_entry_t type, and 2560 * 2) the number of bits in the swap pte as defined by the 2561 * different architectures. In order to find the 2562 * largest possible bit mask, a swap entry with swap type 0 2563 * and swap offset ~0UL is created, encoded to a swap pte, 2564 * decoded to a swp_entry_t again, and finally the swap 2565 * offset is extracted. This will mask all the bits from 2566 * the initial ~0UL mask that can't be encoded in either 2567 * the swp_entry_t or the architecture definition of a 2568 * swap pte. 
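 *
 * Purely illustrative example: on a configuration whose swap pte can
 * encode, say, a 32-bit offset, the round trip of swp_entry(0, ~0UL)
 * through swp_entry_to_pte()/pte_to_swp_entry() comes back with offset
 * 0xffffffff, and maxpages below becomes 0x100000000.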
2569 */ 2570 maxpages = swp_offset(pte_to_swp_entry( 2571 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 2572 last_page = swap_header->info.last_page; 2573 if (last_page > maxpages) { 2574 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 2575 maxpages << (PAGE_SHIFT - 10), 2576 last_page << (PAGE_SHIFT - 10)); 2577 } 2578 if (maxpages > last_page) { 2579 maxpages = last_page + 1; 2580 /* p->max is an unsigned int: don't overflow it */ 2581 if ((unsigned int)maxpages == 0) 2582 maxpages = UINT_MAX; 2583 } 2584 p->highest_bit = maxpages - 1; 2585 2586 if (!maxpages) 2587 return 0; 2588 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 2589 if (swapfilepages && maxpages > swapfilepages) { 2590 pr_warn("Swap area shorter than signature indicates\n"); 2591 return 0; 2592 } 2593 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 2594 return 0; 2595 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 2596 return 0; 2597 2598 return maxpages; 2599 } 2600 2601 #define SWAP_CLUSTER_INFO_COLS \ 2602 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 2603 #define SWAP_CLUSTER_SPACE_COLS \ 2604 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 2605 #define SWAP_CLUSTER_COLS \ 2606 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 2607 2608 static int setup_swap_map_and_extents(struct swap_info_struct *p, 2609 union swap_header *swap_header, 2610 unsigned char *swap_map, 2611 struct swap_cluster_info *cluster_info, 2612 unsigned long maxpages, 2613 sector_t *span) 2614 { 2615 unsigned int j, k; 2616 unsigned int nr_good_pages; 2617 int nr_extents; 2618 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2619 unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; 2620 unsigned long i, idx; 2621 2622 nr_good_pages = maxpages - 1; /* omit header page */ 2623 2624 cluster_list_init(&p->free_clusters); 2625 cluster_list_init(&p->discard_clusters); 2626 2627 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2628 unsigned int page_nr = swap_header->info.badpages[i]; 2629 if (page_nr == 0 || page_nr > swap_header->info.last_page) 2630 return -EINVAL; 2631 if (page_nr < maxpages) { 2632 swap_map[page_nr] = SWAP_MAP_BAD; 2633 nr_good_pages--; 2634 /* 2635 * Haven't marked the cluster free yet, no list 2636 * operation involved 2637 */ 2638 inc_cluster_info_page(p, cluster_info, page_nr); 2639 } 2640 } 2641 2642 /* Haven't marked the cluster free yet, no list operation involved */ 2643 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) 2644 inc_cluster_info_page(p, cluster_info, i); 2645 2646 if (nr_good_pages) { 2647 swap_map[0] = SWAP_MAP_BAD; 2648 /* 2649 * Not mark the cluster free yet, no list 2650 * operation involved 2651 */ 2652 inc_cluster_info_page(p, cluster_info, 0); 2653 p->max = maxpages; 2654 p->pages = nr_good_pages; 2655 nr_extents = setup_swap_extents(p, span); 2656 if (nr_extents < 0) 2657 return nr_extents; 2658 nr_good_pages = p->pages; 2659 } 2660 if (!nr_good_pages) { 2661 pr_warn("Empty swap-file\n"); 2662 return -EINVAL; 2663 } 2664 2665 if (!cluster_info) 2666 return nr_extents; 2667 2668 2669 /* 2670 * Reduce false cache line sharing between cluster_info and 2671 * sharing same address space. 
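 *
 * For instance (illustrative): with SWAP_CLUSTER_COLS == 4 and col == 1,
 * the free list is filled in the order 1, 5, 9, ... then 2, 6, 10, ...,
 * so clusters handed out back to back tend to have cluster_info entries
 * in different cache lines rather than being neighbours in the same one.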
2672 */ 2673 for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 2674 j = (k + col) % SWAP_CLUSTER_COLS; 2675 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 2676 idx = i * SWAP_CLUSTER_COLS + j; 2677 if (idx >= nr_clusters) 2678 continue; 2679 if (cluster_count(&cluster_info[idx])) 2680 continue; 2681 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); 2682 cluster_list_add_tail(&p->free_clusters, cluster_info, 2683 idx); 2684 } 2685 } 2686 return nr_extents; 2687 } 2688 2689 /* 2690 * Helper to sys_swapon determining if a given swap 2691 * backing device queue supports DISCARD operations. 2692 */ 2693 static bool swap_discardable(struct swap_info_struct *si) 2694 { 2695 struct request_queue *q = bdev_get_queue(si->bdev); 2696 2697 if (!q || !blk_queue_discard(q)) 2698 return false; 2699 2700 return true; 2701 } 2702 2703 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2704 { 2705 struct swap_info_struct *p; 2706 struct filename *name; 2707 struct file *swap_file = NULL; 2708 struct address_space *mapping; 2709 int prio; 2710 int error; 2711 union swap_header *swap_header; 2712 int nr_extents; 2713 sector_t span; 2714 unsigned long maxpages; 2715 unsigned char *swap_map = NULL; 2716 struct swap_cluster_info *cluster_info = NULL; 2717 unsigned long *frontswap_map = NULL; 2718 struct page *page = NULL; 2719 struct inode *inode = NULL; 2720 2721 if (swap_flags & ~SWAP_FLAGS_VALID) 2722 return -EINVAL; 2723 2724 if (!capable(CAP_SYS_ADMIN)) 2725 return -EPERM; 2726 2727 p = alloc_swap_info(); 2728 if (IS_ERR(p)) 2729 return PTR_ERR(p); 2730 2731 INIT_WORK(&p->discard_work, swap_discard_work); 2732 2733 name = getname(specialfile); 2734 if (IS_ERR(name)) { 2735 error = PTR_ERR(name); 2736 name = NULL; 2737 goto bad_swap; 2738 } 2739 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); 2740 if (IS_ERR(swap_file)) { 2741 error = PTR_ERR(swap_file); 2742 swap_file = NULL; 2743 goto bad_swap; 2744 } 2745 2746 p->swap_file = swap_file; 2747 mapping = swap_file->f_mapping; 2748 inode = mapping->host; 2749 2750 /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ 2751 error = claim_swapfile(p, inode); 2752 if (unlikely(error)) 2753 goto bad_swap; 2754 2755 /* 2756 * Read the swap header. 
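 * (It lives in page 0 of the swap file: the SWAPSPACE2 signature plus
 * the last_page / nr_badpages info that read_swap_header() parses.)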
2757 */ 2758 if (!mapping->a_ops->readpage) { 2759 error = -EINVAL; 2760 goto bad_swap; 2761 } 2762 page = read_mapping_page(mapping, 0, swap_file); 2763 if (IS_ERR(page)) { 2764 error = PTR_ERR(page); 2765 goto bad_swap; 2766 } 2767 swap_header = kmap(page); 2768 2769 maxpages = read_swap_header(p, swap_header, inode); 2770 if (unlikely(!maxpages)) { 2771 error = -EINVAL; 2772 goto bad_swap; 2773 } 2774 2775 /* OK, set up the swap map and apply the bad block list */ 2776 swap_map = vzalloc(maxpages); 2777 if (!swap_map) { 2778 error = -ENOMEM; 2779 goto bad_swap; 2780 } 2781 2782 if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) 2783 p->flags |= SWP_STABLE_WRITES; 2784 2785 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2786 int cpu; 2787 unsigned long ci, nr_cluster; 2788 2789 p->flags |= SWP_SOLIDSTATE; 2790 /* 2791 * select a random position to start with to help wear leveling 2792 * SSD 2793 */ 2794 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2795 nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2796 2797 cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info), 2798 GFP_KERNEL); 2799 if (!cluster_info) { 2800 error = -ENOMEM; 2801 goto bad_swap; 2802 } 2803 2804 for (ci = 0; ci < nr_cluster; ci++) 2805 spin_lock_init(&((cluster_info + ci)->lock)); 2806 2807 p->percpu_cluster = alloc_percpu(struct percpu_cluster); 2808 if (!p->percpu_cluster) { 2809 error = -ENOMEM; 2810 goto bad_swap; 2811 } 2812 for_each_possible_cpu(cpu) { 2813 struct percpu_cluster *cluster; 2814 cluster = per_cpu_ptr(p->percpu_cluster, cpu); 2815 cluster_set_null(&cluster->index); 2816 } 2817 } 2818 2819 error = swap_cgroup_swapon(p->type, maxpages); 2820 if (error) 2821 goto bad_swap; 2822 2823 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 2824 cluster_info, maxpages, &span); 2825 if (unlikely(nr_extents < 0)) { 2826 error = nr_extents; 2827 goto bad_swap; 2828 } 2829 /* frontswap enabled? set up bit-per-page map for frontswap */ 2830 if (IS_ENABLED(CONFIG_FRONTSWAP)) 2831 frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long), 2832 GFP_KERNEL); 2833 2834 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2835 /* 2836 * When discard is enabled for swap with no particular 2837 * policy flagged, we set all swap discard flags here in 2838 * order to sustain backward compatibility with older 2839 * swapon(8) releases. 2840 */ 2841 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 2842 SWP_PAGE_DISCARD); 2843 2844 /* 2845 * By flagging sys_swapon, a sysadmin can tell us to 2846 * either do single-time area discards only, or to just 2847 * perform discards for released swap page-clusters. 2848 * Now it's time to adjust the p->flags accordingly. 2849 */ 2850 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 2851 p->flags &= ~SWP_PAGE_DISCARD; 2852 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 2853 p->flags &= ~SWP_AREA_DISCARD; 2854 2855 /* issue a swapon-time discard if it's still required */ 2856 if (p->flags & SWP_AREA_DISCARD) { 2857 int err = discard_swap(p); 2858 if (unlikely(err)) 2859 pr_err("swapon: discard_swap(%p): %d\n", 2860 p, err); 2861 } 2862 } 2863 2864 error = init_swap_address_space(p->type, maxpages); 2865 if (error) 2866 goto bad_swap; 2867 2868 mutex_lock(&swapon_mutex); 2869 prio = -1; 2870 if (swap_flags & SWAP_FLAG_PREFER) 2871 prio = 2872 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2873 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); 2874 2875 pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 2876 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2877 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2878 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2879 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2880 (p->flags & SWP_AREA_DISCARD) ? "s" : "", 2881 (p->flags & SWP_PAGE_DISCARD) ? "c" : "", 2882 (frontswap_map) ? "FS" : ""); 2883 2884 mutex_unlock(&swapon_mutex); 2885 atomic_inc(&proc_poll_event); 2886 wake_up_interruptible(&proc_poll_wait); 2887 2888 if (S_ISREG(inode->i_mode)) 2889 inode->i_flags |= S_SWAPFILE; 2890 error = 0; 2891 goto out; 2892 bad_swap: 2893 free_percpu(p->percpu_cluster); 2894 p->percpu_cluster = NULL; 2895 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2896 set_blocksize(p->bdev, p->old_block_size); 2897 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2898 } 2899 destroy_swap_extents(p); 2900 swap_cgroup_swapoff(p->type); 2901 spin_lock(&swap_lock); 2902 p->swap_file = NULL; 2903 p->flags = 0; 2904 spin_unlock(&swap_lock); 2905 vfree(swap_map); 2906 vfree(cluster_info); 2907 if (swap_file) { 2908 if (inode && S_ISREG(inode->i_mode)) { 2909 inode_unlock(inode); 2910 inode = NULL; 2911 } 2912 filp_close(swap_file, NULL); 2913 } 2914 out: 2915 if (page && !IS_ERR(page)) { 2916 kunmap(page); 2917 put_page(page); 2918 } 2919 if (name) 2920 putname(name); 2921 if (inode && S_ISREG(inode->i_mode)) 2922 inode_unlock(inode); 2923 if (!error) 2924 enable_swap_slots_cache(); 2925 return error; 2926 } 2927 2928 void si_swapinfo(struct sysinfo *val) 2929 { 2930 unsigned int type; 2931 unsigned long nr_to_be_unused = 0; 2932 2933 spin_lock(&swap_lock); 2934 for (type = 0; type < nr_swapfiles; type++) { 2935 struct swap_info_struct *si = swap_info[type]; 2936 2937 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2938 nr_to_be_unused += si->inuse_pages; 2939 } 2940 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 2941 val->totalswap = total_swap_pages + nr_to_be_unused; 2942 spin_unlock(&swap_lock); 2943 } 2944 2945 /* 2946 * Verify that a swap entry is valid and increment its swap map count. 2947 * 2948 * Returns error code in following case. 2949 * - success -> 0 2950 * - swp_entry is invalid -> EINVAL 2951 * - swp_entry is migration entry -> EINVAL 2952 * - swap-cache reference is requested but there is already one. -> EEXIST 2953 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2954 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 2955 */ 2956 static int __swap_duplicate(swp_entry_t entry, unsigned char usage) 2957 { 2958 struct swap_info_struct *p; 2959 struct swap_cluster_info *ci; 2960 unsigned long offset, type; 2961 unsigned char count; 2962 unsigned char has_cache; 2963 int err = -EINVAL; 2964 2965 if (non_swap_entry(entry)) 2966 goto out; 2967 2968 type = swp_type(entry); 2969 if (type >= nr_swapfiles) 2970 goto bad_file; 2971 p = swap_info[type]; 2972 offset = swp_offset(entry); 2973 if (unlikely(offset >= p->max)) 2974 goto out; 2975 2976 ci = lock_cluster_or_swap_info(p, offset); 2977 2978 count = p->swap_map[offset]; 2979 2980 /* 2981 * swapin_readahead() doesn't check if a swap entry is valid, so the 2982 * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
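 *
 * (Reminder of the swap_map byte layout manipulated below: the low bits
 * hold the map count itself, capped at SWAP_MAP_MAX, SWAP_HAS_CACHE
 * flags a swapcache reference, and COUNT_CONTINUED means the count has
 * overflowed into the continuation pages handled later in this file.)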
2983 */ 2984 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 2985 err = -ENOENT; 2986 goto unlock_out; 2987 } 2988 2989 has_cache = count & SWAP_HAS_CACHE; 2990 count &= ~SWAP_HAS_CACHE; 2991 err = 0; 2992 2993 if (usage == SWAP_HAS_CACHE) { 2994 2995 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2996 if (!has_cache && count) 2997 has_cache = SWAP_HAS_CACHE; 2998 else if (has_cache) /* someone else added cache */ 2999 err = -EEXIST; 3000 else /* no users remaining */ 3001 err = -ENOENT; 3002 3003 } else if (count || has_cache) { 3004 3005 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3006 count += usage; 3007 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) 3008 err = -EINVAL; 3009 else if (swap_count_continued(p, offset, count)) 3010 count = COUNT_CONTINUED; 3011 else 3012 err = -ENOMEM; 3013 } else 3014 err = -ENOENT; /* unused swap entry */ 3015 3016 p->swap_map[offset] = count | has_cache; 3017 3018 unlock_out: 3019 unlock_cluster_or_swap_info(p, ci); 3020 out: 3021 return err; 3022 3023 bad_file: 3024 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); 3025 goto out; 3026 } 3027 3028 /* 3029 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3030 * (in which case its reference count is never incremented). 3031 */ 3032 void swap_shmem_alloc(swp_entry_t entry) 3033 { 3034 __swap_duplicate(entry, SWAP_MAP_SHMEM); 3035 } 3036 3037 /* 3038 * Increase reference count of swap entry by 1. 3039 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3040 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3041 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3042 * might occur if a page table entry has got corrupted. 3043 */ 3044 int swap_duplicate(swp_entry_t entry) 3045 { 3046 int err = 0; 3047 3048 while (!err && __swap_duplicate(entry, 1) == -ENOMEM) 3049 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3050 return err; 3051 } 3052 3053 /* 3054 * @entry: swap entry for which we allocate swap cache. 3055 * 3056 * Called when allocating swap cache for existing swap entry, 3057 * This can return error codes. Returns 0 at success. 3058 * -EBUSY means there is a swap cache. 3059 * Note: return code is different from swap_duplicate(). 3060 */ 3061 int swapcache_prepare(swp_entry_t entry) 3062 { 3063 return __swap_duplicate(entry, SWAP_HAS_CACHE); 3064 } 3065 3066 struct swap_info_struct *page_swap_info(struct page *page) 3067 { 3068 swp_entry_t swap = { .val = page_private(page) }; 3069 return swap_info[swp_type(swap)]; 3070 } 3071 3072 /* 3073 * out-of-line __page_file_ methods to avoid include hell. 3074 */ 3075 struct address_space *__page_file_mapping(struct page *page) 3076 { 3077 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 3078 return page_swap_info(page)->swap_file->f_mapping; 3079 } 3080 EXPORT_SYMBOL_GPL(__page_file_mapping); 3081 3082 pgoff_t __page_file_index(struct page *page) 3083 { 3084 swp_entry_t swap = { .val = page_private(page) }; 3085 VM_BUG_ON_PAGE(!PageSwapCache(page), page); 3086 return swp_offset(swap); 3087 } 3088 EXPORT_SYMBOL_GPL(__page_file_index); 3089 3090 /* 3091 * add_swap_count_continuation - called when a swap count is duplicated 3092 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3093 * page of the original vmalloc'ed swap_map, to hold the continuation count 3094 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3095 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 
3096 * 3097 * These continuation pages are seldom referenced: the common paths all work 3098 * on the original swap_map, only referring to a continuation page when the 3099 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3100 * 3101 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3102 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3103 * can be called after dropping locks. 3104 */ 3105 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3106 { 3107 struct swap_info_struct *si; 3108 struct swap_cluster_info *ci; 3109 struct page *head; 3110 struct page *page; 3111 struct page *list_page; 3112 pgoff_t offset; 3113 unsigned char count; 3114 3115 /* 3116 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3117 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3118 */ 3119 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3120 3121 si = swap_info_get(entry); 3122 if (!si) { 3123 /* 3124 * An acceptable race has occurred since the failing 3125 * __swap_duplicate(): the swap entry has been freed, 3126 * perhaps even the whole swap_map cleared for swapoff. 3127 */ 3128 goto outer; 3129 } 3130 3131 offset = swp_offset(entry); 3132 3133 ci = lock_cluster(si, offset); 3134 3135 count = si->swap_map[offset] & ~SWAP_HAS_CACHE; 3136 3137 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3138 /* 3139 * The higher the swap count, the more likely it is that tasks 3140 * will race to add swap count continuation: we need to avoid 3141 * over-provisioning. 3142 */ 3143 goto out; 3144 } 3145 3146 if (!page) { 3147 unlock_cluster(ci); 3148 spin_unlock(&si->lock); 3149 return -ENOMEM; 3150 } 3151 3152 /* 3153 * We are fortunate that although vmalloc_to_page uses pte_offset_map, 3154 * no architecture is using highmem pages for kernel page tables: so it 3155 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. 3156 */ 3157 head = vmalloc_to_page(si->swap_map + offset); 3158 offset &= ~PAGE_MASK; 3159 3160 /* 3161 * Page allocation does not initialize the page's lru field, 3162 * but it does always reset its private field. 3163 */ 3164 if (!page_private(head)) { 3165 BUG_ON(count & COUNT_CONTINUED); 3166 INIT_LIST_HEAD(&head->lru); 3167 set_page_private(head, SWP_CONTINUED); 3168 si->flags |= SWP_CONTINUED; 3169 } 3170 3171 list_for_each_entry(list_page, &head->lru, lru) { 3172 unsigned char *map; 3173 3174 /* 3175 * If the previous map said no continuation, but we've found 3176 * a continuation page, free our allocation and use this one. 3177 */ 3178 if (!(count & COUNT_CONTINUED)) 3179 goto out; 3180 3181 map = kmap_atomic(list_page) + offset; 3182 count = *map; 3183 kunmap_atomic(map); 3184 3185 /* 3186 * If this continuation count now has some space in it, 3187 * free our allocation and use this one. 
3188 */ 3189 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3190 goto out; 3191 } 3192 3193 list_add_tail(&page->lru, &head->lru); 3194 page = NULL; /* now it's attached, don't free it */ 3195 out: 3196 unlock_cluster(ci); 3197 spin_unlock(&si->lock); 3198 outer: 3199 if (page) 3200 __free_page(page); 3201 return 0; 3202 } 3203 3204 /* 3205 * swap_count_continued - when the original swap_map count is incremented 3206 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3207 * into, carry if so, or else fail until a new continuation page is allocated; 3208 * when the original swap_map count is decremented from 0 with continuation, 3209 * borrow from the continuation and report whether it still holds more. 3210 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 3211 * lock. 3212 */ 3213 static bool swap_count_continued(struct swap_info_struct *si, 3214 pgoff_t offset, unsigned char count) 3215 { 3216 struct page *head; 3217 struct page *page; 3218 unsigned char *map; 3219 3220 head = vmalloc_to_page(si->swap_map + offset); 3221 if (page_private(head) != SWP_CONTINUED) { 3222 BUG_ON(count & COUNT_CONTINUED); 3223 return false; /* need to add count continuation */ 3224 } 3225 3226 offset &= ~PAGE_MASK; 3227 page = list_entry(head->lru.next, struct page, lru); 3228 map = kmap_atomic(page) + offset; 3229 3230 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3231 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3232 3233 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3234 /* 3235 * Think of how you add 1 to 999 3236 */ 3237 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3238 kunmap_atomic(map); 3239 page = list_entry(page->lru.next, struct page, lru); 3240 BUG_ON(page == head); 3241 map = kmap_atomic(page) + offset; 3242 } 3243 if (*map == SWAP_CONT_MAX) { 3244 kunmap_atomic(map); 3245 page = list_entry(page->lru.next, struct page, lru); 3246 if (page == head) 3247 return false; /* add count continuation */ 3248 map = kmap_atomic(page) + offset; 3249 init_map: *map = 0; /* we didn't zero the page */ 3250 } 3251 *map += 1; 3252 kunmap_atomic(map); 3253 page = list_entry(page->lru.prev, struct page, lru); 3254 while (page != head) { 3255 map = kmap_atomic(page) + offset; 3256 *map = COUNT_CONTINUED; 3257 kunmap_atomic(map); 3258 page = list_entry(page->lru.prev, struct page, lru); 3259 } 3260 return true; /* incremented */ 3261 3262 } else { /* decrementing */ 3263 /* 3264 * Think of how you subtract 1 from 1000 3265 */ 3266 BUG_ON(count != COUNT_CONTINUED); 3267 while (*map == COUNT_CONTINUED) { 3268 kunmap_atomic(map); 3269 page = list_entry(page->lru.next, struct page, lru); 3270 BUG_ON(page == head); 3271 map = kmap_atomic(page) + offset; 3272 } 3273 BUG_ON(*map == 0); 3274 *map -= 1; 3275 if (*map == 0) 3276 count = 0; 3277 kunmap_atomic(map); 3278 page = list_entry(page->lru.prev, struct page, lru); 3279 while (page != head) { 3280 map = kmap_atomic(page) + offset; 3281 *map = SWAP_CONT_MAX | count; 3282 count = COUNT_CONTINUED; 3283 kunmap_atomic(map); 3284 page = list_entry(page->lru.prev, struct page, lru); 3285 } 3286 return count == COUNT_CONTINUED; 3287 } 3288 } 3289 3290 /* 3291 * free_swap_count_continuations - swapoff free all the continuation pages 3292 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 
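 *
 * Each PAGE_SIZE chunk of the vmalloc'ed swap_map keeps its continuation
 * pages on the lru list of that chunk's struct page, which is why the
 * loop below walks the map in PAGE_SIZE strides.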
3293 */ 3294 static void free_swap_count_continuations(struct swap_info_struct *si) 3295 { 3296 pgoff_t offset; 3297 3298 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3299 struct page *head; 3300 head = vmalloc_to_page(si->swap_map + offset); 3301 if (page_private(head)) { 3302 struct page *page, *next; 3303 3304 list_for_each_entry_safe(page, next, &head->lru, lru) { 3305 list_del(&page->lru); 3306 __free_page(page); 3307 } 3308 } 3309 } 3310 } 3311