// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/mmu_notifier.c
 *
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright (C) 2008 SGI
 *            Christoph Lameter <cl@linux.com>
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);

#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
        .name = "mmu_notifier_invalidate_range_start"
};
#endif

/*
 * This function can't run concurrently against mmu_notifier_register
 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
 * in parallel despite there being no task using this mm any more,
 * through the vmas outside of the exit_mmap context, such as with
 * vmtruncate. This serializes against mmu_notifier_unregister with
 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 */
void __mmu_notifier_release(struct mm_struct *mm)
{
        struct mmu_notifier *mn;
        int id;

        /*
         * SRCU here will block mmu_notifier_unregister until
         * ->release returns.
         */
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
                /*
                 * If ->release runs before mmu_notifier_unregister it must be
                 * handled, as it's the only way for the driver to flush all
                 * existing sptes and stop the driver from establishing any more
                 * sptes before all the pages in the mm are freed.
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);

        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
                                 struct mmu_notifier,
                                 hlist);
                /*
                 * We arrived before mmu_notifier_unregister so
                 * mmu_notifier_unregister will do nothing other than to wait
                 * for ->release to finish and for mmu_notifier_unregister to
                 * return.
                 */
                hlist_del_init_rcu(&mn->hlist);
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
        srcu_read_unlock(&srcu, id);

        /*
         * synchronize_srcu here prevents mmu_notifier_release from returning to
         * exit_mmap (which would proceed with freeing all pages in the mm)
         * until the ->release method returns, if it was invoked by
         * mmu_notifier_unregister.
         *
         * The mmu_notifier_mm can't go away from under us because one mm_count
         * is held by exit_mmap.
         */
        synchronize_srcu(&srcu);
}
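/*
 * Example (illustrative sketch only, not part of the upstream file): a
 * minimal ->release callback for a hypothetical driver that mirrors the
 * mm. struct my_mirror, its embedded "notifier" member and
 * my_mirror_drop_all_sptes() are made-up names. The point, per the
 * comments above, is that ->release must tear down every secondary
 * mapping and stop new ones from being established, because exit_mmap
 * is about to free all pages in the mm.
 *
 *	static void my_mirror_release(struct mmu_notifier *mn,
 *				      struct mm_struct *mm)
 *	{
 *		struct my_mirror *mirror = container_of(mn, struct my_mirror,
 *							notifier);
 *
 *		my_mirror_drop_all_sptes(mirror);
 *	}
 */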
/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending on whether the mapping
 * previously existed or not.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long end)
{
        struct mmu_notifier *mn;
        int young = 0, id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->clear_flush_young)
                        young |= mn->ops->clear_flush_young(mn, mm, start, end);
        }
        srcu_read_unlock(&srcu, id);

        return young;
}

int __mmu_notifier_clear_young(struct mm_struct *mm,
                               unsigned long start,
                               unsigned long end)
{
        struct mmu_notifier *mn;
        int young = 0, id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->clear_young)
                        young |= mn->ops->clear_young(mn, mm, start, end);
        }
        srcu_read_unlock(&srcu, id);

        return young;
}

int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
{
        struct mmu_notifier *mn;
        int young = 0, id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->test_young) {
                        young = mn->ops->test_young(mn, mm, address);
                        if (young)
                                break;
                }
        }
        srcu_read_unlock(&srcu, id);

        return young;
}

void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                               pte_t pte)
{
        struct mmu_notifier *mn;
        int id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->change_pte)
                        mn->ops->change_pte(mn, mm, address, pte);
        }
        srcu_read_unlock(&srcu, id);
}

int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        struct mmu_notifier *mn;
        int ret = 0;
        int id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_start) {
                        int _ret;

                        if (!mmu_notifier_range_blockable(range))
                                non_block_start();
                        _ret = mn->ops->invalidate_range_start(mn, range);
                        if (!mmu_notifier_range_blockable(range))
                                non_block_end();
                        if (_ret) {
                                pr_info("%pS callback failed with %d in %sblockable context.\n",
                                        mn->ops->invalidate_range_start, _ret,
                                        !mmu_notifier_range_blockable(range) ? "non-" : "");
                                WARN_ON(mmu_notifier_range_blockable(range) ||
                                        _ret != -EAGAIN);
                                ret = _ret;
                        }
                }
        }
        srcu_read_unlock(&srcu, id);

        return ret;
}
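/*
 * Example (illustrative sketch only): an ->invalidate_range_start
 * callback that honours the contract enforced by the WARN_ON above:
 * failure is only tolerated in non-blockable context and only with
 * -EAGAIN. The driver-side names (struct my_mirror,
 * my_mirror_invalidate(), my_mirror_try_invalidate()) are hypothetical.
 *
 *	static int my_mirror_invalidate_range_start(struct mmu_notifier *mn,
 *				const struct mmu_notifier_range *range)
 *	{
 *		struct my_mirror *mirror = container_of(mn, struct my_mirror,
 *							notifier);
 *
 *		if (mmu_notifier_range_blockable(range)) {
 *			my_mirror_invalidate(mirror, range->start, range->end);
 *			return 0;
 *		}
 *
 *		if (!my_mirror_try_invalidate(mirror, range->start, range->end))
 *			return -EAGAIN;
 *		return 0;
 *	}
 */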
void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
                                         bool only_end)
{
        struct mmu_notifier *mn;
        int id;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
                /*
                 * Call invalidate_range here too, to avoid the need for the
                 * subsystem to register an invalidate_range_end call-back
                 * when it already registers invalidate_range. Usually a
                 * subsystem registers either invalidate_range_start()/end() or
                 * invalidate_range(), so this is no additional overhead
                 * (besides the pointer check).
                 *
                 * We skip the call to invalidate_range() if we know it is
                 * safe, i.e. the call site used
                 * mmu_notifier_invalidate_range_only_end(), which is safe to
                 * do when we know that a call to invalidate_range() already
                 * happened under the page table lock.
                 */
                if (!only_end && mn->ops->invalidate_range)
                        mn->ops->invalidate_range(mn, range->mm,
                                                  range->start,
                                                  range->end);
                if (mn->ops->invalidate_range_end) {
                        if (!mmu_notifier_range_blockable(range))
                                non_block_start();
                        mn->ops->invalidate_range_end(mn, range);
                        if (!mmu_notifier_range_blockable(range))
                                non_block_end();
                }
        }
        srcu_read_unlock(&srcu, id);
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                     unsigned long start, unsigned long end)
{
        struct mmu_notifier *mn;
        int id;

        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range)
                        mn->ops->invalidate_range(mn, mm, start, end);
        }
        srcu_read_unlock(&srcu, id);
}

/*
 * Same as mmu_notifier_register() but here the caller must hold the
 * mmap_sem in write mode.
 */
int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct mmu_notifier_mm *mmu_notifier_mm = NULL;
        int ret;

        lockdep_assert_held_write(&mm->mmap_sem);
        BUG_ON(atomic_read(&mm->mm_users) <= 0);

        if (IS_ENABLED(CONFIG_LOCKDEP)) {
                fs_reclaim_acquire(GFP_KERNEL);
                lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
                lock_map_release(&__mmu_notifier_invalidate_range_start_map);
                fs_reclaim_release(GFP_KERNEL);
        }

        mn->mm = mm;
        mn->users = 1;

        if (!mm->mmu_notifier_mm) {
                /*
                 * kmalloc cannot be called under mm_take_all_locks(), but we
                 * know that mm->mmu_notifier_mm can't change while we hold
                 * the write side of the mmap_sem.
                 */
                mmu_notifier_mm =
                        kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
                if (!mmu_notifier_mm)
                        return -ENOMEM;

                INIT_HLIST_HEAD(&mmu_notifier_mm->list);
                spin_lock_init(&mmu_notifier_mm->lock);
        }

        ret = mm_take_all_locks(mm);
        if (unlikely(ret))
                goto out_clean;

        /* Pairs with the mmdrop in mmu_notifier_unregister_* */
        mmgrab(mm);

        /*
         * Serialize the update against mmu_notifier_unregister. A
         * side note: mmu_notifier_release can't run concurrently with
         * us because we hold the mm_users pin (either implicitly as
         * current->mm or explicitly with get_task_mm() or similar).
         * We can't race against any other mmu notifier method either
         * thanks to mm_take_all_locks().
         */
        if (mmu_notifier_mm)
                mm->mmu_notifier_mm = mmu_notifier_mm;

        spin_lock(&mm->mmu_notifier_mm->lock);
        hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
        spin_unlock(&mm->mmu_notifier_mm->lock);

        mm_drop_all_locks(mm);
        BUG_ON(atomic_read(&mm->mm_users) <= 0);
        return 0;

out_clean:
        kfree(mmu_notifier_mm);
        return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);
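/*
 * Example (illustrative sketch only): registering from a context that
 * already holds the mmap_sem for writing, which is why the __ variant
 * above exists. The ops callbacks and everything prefixed my_ are
 * hypothetical driver-side names.
 *
 *	static const struct mmu_notifier_ops my_mirror_ops = {
 *		.release		= my_mirror_release,
 *		.invalidate_range_start	= my_mirror_invalidate_range_start,
 *	};
 *
 *	int my_mirror_attach_locked(struct my_mirror *mirror,
 *				    struct mm_struct *mm)
 *	{
 *		lockdep_assert_held_write(&mm->mmap_sem);
 *
 *		mirror->notifier.ops = &my_mirror_ops;
 *		return __mmu_notifier_register(&mirror->notifier, mm);
 *	}
 */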
/**
 * mmu_notifier_register - Register a notifier on a mm
 * @mn: The notifier to attach
 * @mm: The mm to attach the notifier to
 *
 * Must not hold mmap_sem nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns.
 *
 * mmu_notifier_unregister() or mmu_notifier_put() must always be called to
 * unregister the notifier.
 *
 * While the caller has a mmu_notifier get, the mn->mm pointer will remain
 * valid, and can be converted to an active mm pointer via mmget_not_zero().
 */
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
        int ret;

        down_write(&mm->mmap_sem);
        ret = __mmu_notifier_register(mn, mm);
        up_write(&mm->mmap_sem);
        return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);

static struct mmu_notifier *
find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
        struct mmu_notifier *mn;

        spin_lock(&mm->mmu_notifier_mm->lock);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops != ops)
                        continue;

                if (likely(mn->users != UINT_MAX))
                        mn->users++;
                else
                        mn = ERR_PTR(-EOVERFLOW);
                spin_unlock(&mm->mmu_notifier_mm->lock);
                return mn;
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
        return NULL;
}

/**
 * mmu_notifier_get_locked - Return the single struct mmu_notifier for
 *                           the mm & ops
 * @ops: The operations struct being subscribed with
 * @mm: The mm to attach notifiers to
 *
 * This function either allocates a new mmu_notifier via
 * ops->alloc_notifier(), or returns an already existing notifier on the
 * list. The value of the ops pointer is used to determine when two notifiers
 * are the same.
 *
 * Each call to mmu_notifier_get() must be paired with a call to
 * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
 *
 * While the caller has a mmu_notifier get, the mm pointer will remain valid,
 * and can be converted to an active mm pointer via mmget_not_zero().
 */
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm)
{
        struct mmu_notifier *mn;
        int ret;

        lockdep_assert_held_write(&mm->mmap_sem);

        if (mm->mmu_notifier_mm) {
                mn = find_get_mmu_notifier(mm, ops);
                if (mn)
                        return mn;
        }

        mn = ops->alloc_notifier(mm);
        if (IS_ERR(mn))
                return mn;
        mn->ops = ops;
        ret = __mmu_notifier_register(mn, mm);
        if (ret)
                goto out_free;
        return mn;
out_free:
        mn->ops->free_notifier(mn);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);

/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_mm_destroy(struct mm_struct *mm)
{
        BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
        kfree(mm->mmu_notifier_mm);
        mm->mmu_notifier_mm = LIST_POISON1; /* debug */
}
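/*
 * Example (illustrative sketch only): using the get_locked flow
 * documented above. ops->alloc_notifier() and ops->free_notifier() are
 * supplied by the caller; struct my_mirror and the my_ names are
 * hypothetical. mmu_notifier_get_locked() sets mn->ops itself.
 *
 *	static struct mmu_notifier *my_mirror_alloc_notifier(struct mm_struct *mm)
 *	{
 *		struct my_mirror *mirror;
 *
 *		mirror = kzalloc(sizeof(*mirror), GFP_KERNEL);
 *		if (!mirror)
 *			return ERR_PTR(-ENOMEM);
 *		return &mirror->notifier;
 *	}
 *
 *	static void my_mirror_free_notifier(struct mmu_notifier *mn)
 *	{
 *		kfree(container_of(mn, struct my_mirror, notifier));
 *	}
 *
 *	static const struct mmu_notifier_ops my_mirror_get_ops = {
 *		.alloc_notifier	= my_mirror_alloc_notifier,
 *		.free_notifier	= my_mirror_free_notifier,
 *	};
 *
 *	struct mmu_notifier *my_mirror_get(struct mm_struct *mm)
 *	{
 *		struct mmu_notifier *mn;
 *
 *		down_write(&mm->mmap_sem);
 *		mn = mmu_notifier_get_locked(&my_mirror_get_ops, mm);
 *		up_write(&mm->mmap_sem);
 *		return mn;
 *	}
 *
 * On failure my_mirror_get() returns an ERR_PTR() value, matching
 * mmu_notifier_get_locked() above.
 */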
/*
 * This releases the mm_count pin automatically and frees the mm
 * structure if it was the last user of it. It serializes against
 * running mmu notifiers with SRCU and against mmu_notifier_unregister
 * with the unregister lock + SRCU. All sptes must be dropped before
 * calling mmu_notifier_unregister. ->release or any other notifier
 * method may be invoked concurrently with mmu_notifier_unregister,
 * and only after mmu_notifier_unregister returns are we guaranteed
 * that ->release or any other method can't run anymore.
 */
void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
        BUG_ON(atomic_read(&mm->mm_count) <= 0);

        if (!hlist_unhashed(&mn->hlist)) {
                /*
                 * SRCU here will force exit_mmap to wait for ->release to
                 * finish before freeing the pages.
                 */
                int id;

                id = srcu_read_lock(&srcu);
                /*
                 * exit_mmap will block in mmu_notifier_release to guarantee
                 * that ->release is called before freeing the pages.
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
                srcu_read_unlock(&srcu, id);

                spin_lock(&mm->mmu_notifier_mm->lock);
                /*
                 * Cannot use list_del_rcu() since __mmu_notifier_release
                 * can delete it before we hold the lock.
                 */
                hlist_del_init_rcu(&mn->hlist);
                spin_unlock(&mm->mmu_notifier_mm->lock);
        }

        /*
         * Wait for any running method to finish, of course including
         * ->release if it was run by mmu_notifier_release instead of us.
         */
        synchronize_srcu(&srcu);

        BUG_ON(atomic_read(&mm->mm_count) <= 0);

        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
        struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
        struct mm_struct *mm = mn->mm;

        mn->ops->free_notifier(mn);
        /* Pairs with the get in __mmu_notifier_register() */
        mmdrop(mm);
}

/**
 * mmu_notifier_put - Release the reference on the notifier
 * @mn: The notifier to act on
 *
 * This function must be paired with each mmu_notifier_get(); it releases the
 * reference obtained by the get. If this is the last reference then the
 * process to free the notifier will be run asynchronously.
 *
 * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
 * when the mm_struct is destroyed. Instead free_notifier is always called to
 * release any resources held by the user.
 *
 * As ops->release is not guaranteed to be called, the user must ensure that
 * all sptes are dropped, and no new sptes can be established, before
 * mmu_notifier_put() is called.
 *
 * This function can be called from the ops->release callback, however the
 * caller must still ensure it is called pairwise with mmu_notifier_get().
 *
 * Modules calling this function must call mmu_notifier_synchronize() in
 * their __exit functions to ensure the async work is completed.
 */
void mmu_notifier_put(struct mmu_notifier *mn)
{
        struct mm_struct *mm = mn->mm;

        spin_lock(&mm->mmu_notifier_mm->lock);
        if (WARN_ON(!mn->users) || --mn->users)
                goto out_unlock;
        hlist_del_init_rcu(&mn->hlist);
        spin_unlock(&mm->mmu_notifier_mm->lock);

        call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
        return;

out_unlock:
        spin_unlock(&mm->mmu_notifier_mm->lock);
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);
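/*
 * Example (illustrative sketch only): dropping the reference taken by
 * mmu_notifier_get_locked() in the earlier sketch. Per the comment
 * above, the caller must have dropped all sptes and prevented new ones
 * first; the actual freeing then runs asynchronously via call_srcu()
 * and ops->free_notifier(). my_mirror_drop_all_sptes() and struct
 * my_mirror are hypothetical.
 *
 *	void my_mirror_detach(struct my_mirror *mirror)
 *	{
 *		my_mirror_drop_all_sptes(mirror);
 *		mmu_notifier_put(&mirror->notifier);
 *	}
 */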
/**
 * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
 *
 * This function ensures that all outstanding async SRCU work from
 * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
 * associated with an unused mmu_notifier will no longer be called.
 *
 * Before using it the caller must ensure that all of its mmu_notifiers have
 * been fully released via mmu_notifier_put().
 *
 * Modules using the mmu_notifier_put() API should call this in their __exit
 * function to avoid module unloading races.
 */
void mmu_notifier_synchronize(void)
{
        synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);

bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
{
        if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
                return false;
        /* Return true if the vma still has the read flag set. */
        return range->vma->vm_flags & VM_READ;
}
EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
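/*
 * Example (illustrative sketch only): a module using the
 * mmu_notifier_put() API flushes the asynchronous SRCU release work in
 * its __exit routine, as the mmu_notifier_synchronize() comment above
 * requires. The module exit function name is hypothetical.
 *
 *	static void __exit my_mirror_exit(void)
 *	{
 *		mmu_notifier_synchronize();
 *	}
 *	module_exit(my_mirror_exit);
 */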