1 /* 2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2, or (at your option) 7 * any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; see the file COPYING. If not, write to 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 17 */ 18 19 /* 20 * fsnotify inode mark locking/lifetime/and refcnting 21 * 22 * REFCNT: 23 * The group->recnt and mark->refcnt tell how many "things" in the kernel 24 * currently are referencing the objects. Both kind of objects typically will 25 * live inside the kernel with a refcnt of 2, one for its creation and one for 26 * the reference a group and a mark hold to each other. 27 * If you are holding the appropriate locks, you can take a reference and the 28 * object itself is guaranteed to survive until the reference is dropped. 29 * 30 * LOCKING: 31 * There are 3 locks involved with fsnotify inode marks and they MUST be taken 32 * in order as follows: 33 * 34 * group->mark_mutex 35 * mark->lock 36 * inode->i_lock 37 * 38 * group->mark_mutex protects the marks_list anchored inside a given group and 39 * each mark is hooked via the g_list. It also protects the groups private 40 * data (i.e group limits). 41 42 * mark->lock protects the marks attributes like its masks and flags. 43 * Furthermore it protects the access to a reference of the group that the mark 44 * is assigned to as well as the access to a reference of the inode/vfsmount 45 * that is being watched by the mark. 46 * 47 * inode->i_lock protects the i_fsnotify_marks list anchored inside a 48 * given inode and each mark is hooked via the i_list. (and sorta the 49 * free_i_list) 50 * 51 * 52 * LIFETIME: 53 * Inode marks survive between when they are added to an inode and when their 54 * refcnt==0. 55 * 56 * The inode mark can be cleared for a number of different reasons including: 57 * - The inode is unlinked for the last time. (fsnotify_inode_remove) 58 * - The inode is being evicted from cache. (fsnotify_inode_delete) 59 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) 60 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) 61 * - The fsnotify_group associated with the mark is going away and all such marks 62 * need to be cleaned up. (fsnotify_clear_marks_by_group) 63 * 64 * Worst case we are given an inode and need to clean up all the marks on that 65 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each 66 * mark on the list we take a reference (so the mark can't disappear under us). 67 * We remove that mark form the inode's list of marks and we add this mark to a 68 * private list anchored on the stack using i_free_list; we walk i_free_list 69 * and before we destroy the mark we make sure that we dont race with a 70 * concurrent destroy_group by getting a ref to the marks group and taking the 71 * groups mutex. 72 73 * Very similarly for freeing by group, except we use free_g_list. 74 * 75 * This has the very interesting property of being able to run concurrently with 76 * any (or all) other directions. 77 */ 78 79 #include <linux/fs.h> 80 #include <linux/init.h> 81 #include <linux/kernel.h> 82 #include <linux/kthread.h> 83 #include <linux/module.h> 84 #include <linux/mutex.h> 85 #include <linux/slab.h> 86 #include <linux/spinlock.h> 87 #include <linux/srcu.h> 88 89 #include <linux/atomic.h> 90 91 #include <linux/fsnotify_backend.h> 92 #include "fsnotify.h" 93 94 #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ 95 96 struct srcu_struct fsnotify_mark_srcu; 97 static DEFINE_SPINLOCK(destroy_lock); 98 static LIST_HEAD(destroy_list); 99 100 static void fsnotify_mark_destroy_workfn(struct work_struct *work); 101 static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); 102 103 void fsnotify_get_mark(struct fsnotify_mark *mark) 104 { 105 atomic_inc(&mark->refcnt); 106 } 107 108 void fsnotify_put_mark(struct fsnotify_mark *mark) 109 { 110 if (atomic_dec_and_test(&mark->refcnt)) { 111 if (mark->group) 112 fsnotify_put_group(mark->group); 113 mark->free_mark(mark); 114 } 115 } 116 117 /* Calculate mask of events for a list of marks */ 118 u32 fsnotify_recalc_mask(struct hlist_head *head) 119 { 120 u32 new_mask = 0; 121 struct fsnotify_mark *mark; 122 123 hlist_for_each_entry(mark, head, obj_list) 124 new_mask |= mark->mask; 125 return new_mask; 126 } 127 128 /* 129 * Remove mark from inode / vfsmount list, group list, drop inode reference 130 * if we got one. 131 * 132 * Must be called with group->mark_mutex held. 133 */ 134 void fsnotify_detach_mark(struct fsnotify_mark *mark) 135 { 136 struct inode *inode = NULL; 137 struct fsnotify_group *group = mark->group; 138 139 BUG_ON(!mutex_is_locked(&group->mark_mutex)); 140 141 spin_lock(&mark->lock); 142 143 /* something else already called this function on this mark */ 144 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { 145 spin_unlock(&mark->lock); 146 return; 147 } 148 149 mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; 150 151 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 152 inode = mark->inode; 153 fsnotify_destroy_inode_mark(mark); 154 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) 155 fsnotify_destroy_vfsmount_mark(mark); 156 else 157 BUG(); 158 /* 159 * Note that we didn't update flags telling whether inode cares about 160 * what's happening with children. We update these flags from 161 * __fsnotify_parent() lazily when next event happens on one of our 162 * children. 163 */ 164 165 list_del_init(&mark->g_list); 166 167 spin_unlock(&mark->lock); 168 169 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) 170 iput(inode); 171 172 atomic_dec(&group->num_marks); 173 } 174 175 /* 176 * Prepare mark for freeing and add it to the list of marks prepared for 177 * freeing. The actual freeing must happen after SRCU period ends and the 178 * caller is responsible for this. 179 * 180 * The function returns true if the mark was added to the list of marks for 181 * freeing. The function returns false if someone else has already called 182 * __fsnotify_free_mark() for the mark. 183 */ 184 static bool __fsnotify_free_mark(struct fsnotify_mark *mark) 185 { 186 struct fsnotify_group *group = mark->group; 187 188 spin_lock(&mark->lock); 189 /* something else already called this function on this mark */ 190 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { 191 spin_unlock(&mark->lock); 192 return false; 193 } 194 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 195 spin_unlock(&mark->lock); 196 197 /* 198 * Some groups like to know that marks are being freed. This is a 199 * callback to the group function to let it know that this mark 200 * is being freed. 201 */ 202 if (group->ops->freeing_mark) 203 group->ops->freeing_mark(mark, group); 204 205 spin_lock(&destroy_lock); 206 list_add(&mark->g_list, &destroy_list); 207 spin_unlock(&destroy_lock); 208 209 return true; 210 } 211 212 /* 213 * Free fsnotify mark. The freeing is actually happening from a workqueue which 214 * first waits for srcu period end. Caller must have a reference to the mark 215 * or be protected by fsnotify_mark_srcu. 216 */ 217 void fsnotify_free_mark(struct fsnotify_mark *mark) 218 { 219 if (__fsnotify_free_mark(mark)) { 220 queue_delayed_work(system_unbound_wq, &reaper_work, 221 FSNOTIFY_REAPER_DELAY); 222 } 223 } 224 225 void fsnotify_destroy_mark(struct fsnotify_mark *mark, 226 struct fsnotify_group *group) 227 { 228 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 229 fsnotify_detach_mark(mark); 230 mutex_unlock(&group->mark_mutex); 231 fsnotify_free_mark(mark); 232 } 233 234 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) 235 { 236 struct fsnotify_mark *mark; 237 238 while (1) { 239 /* 240 * We have to be careful since we can race with e.g. 241 * fsnotify_clear_marks_by_group() and once we drop 'lock', 242 * mark can get removed from the obj_list and destroyed. But 243 * we are holding mark reference so mark cannot be freed and 244 * calling fsnotify_destroy_mark() more than once is fine. 245 */ 246 spin_lock(lock); 247 if (hlist_empty(head)) { 248 spin_unlock(lock); 249 break; 250 } 251 mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); 252 /* 253 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here 254 * since inode / mount is going away anyway. So just remove 255 * mark from the list. 256 */ 257 hlist_del_init_rcu(&mark->obj_list); 258 fsnotify_get_mark(mark); 259 spin_unlock(lock); 260 fsnotify_destroy_mark(mark, mark->group); 261 fsnotify_put_mark(mark); 262 } 263 } 264 265 void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) 266 { 267 assert_spin_locked(&mark->lock); 268 269 mark->mask = mask; 270 271 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) 272 fsnotify_set_inode_mark_mask_locked(mark, mask); 273 } 274 275 void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) 276 { 277 assert_spin_locked(&mark->lock); 278 279 mark->ignored_mask = mask; 280 } 281 282 /* 283 * Sorting function for lists of fsnotify marks. 284 * 285 * Fanotify supports different notification classes (reflected as priority of 286 * notification group). Events shall be passed to notification groups in 287 * decreasing priority order. To achieve this marks in notification lists for 288 * inodes and vfsmounts are sorted so that priorities of corresponding groups 289 * are descending. 290 * 291 * Furthermore correct handling of the ignore mask requires processing inode 292 * and vfsmount marks of each group together. Using the group address as 293 * further sort criterion provides a unique sorting order and thus we can 294 * merge inode and vfsmount lists of marks in linear time and find groups 295 * present in both lists. 296 * 297 * A return value of 1 signifies that b has priority over a. 298 * A return value of 0 signifies that the two marks have to be handled together. 299 * A return value of -1 signifies that a has priority over b. 300 */ 301 int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) 302 { 303 if (a == b) 304 return 0; 305 if (!a) 306 return 1; 307 if (!b) 308 return -1; 309 if (a->priority < b->priority) 310 return 1; 311 if (a->priority > b->priority) 312 return -1; 313 if (a < b) 314 return 1; 315 return -1; 316 } 317 318 /* Add mark into proper place in given list of marks */ 319 int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, 320 int allow_dups) 321 { 322 struct fsnotify_mark *lmark, *last = NULL; 323 int cmp; 324 325 /* is mark the first mark? */ 326 if (hlist_empty(head)) { 327 hlist_add_head_rcu(&mark->obj_list, head); 328 return 0; 329 } 330 331 /* should mark be in the middle of the current list? */ 332 hlist_for_each_entry(lmark, head, obj_list) { 333 last = lmark; 334 335 if ((lmark->group == mark->group) && !allow_dups) 336 return -EEXIST; 337 338 cmp = fsnotify_compare_groups(lmark->group, mark->group); 339 if (cmp >= 0) { 340 hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); 341 return 0; 342 } 343 } 344 345 BUG_ON(last == NULL); 346 /* mark should be the last entry. last is the current last entry */ 347 hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); 348 return 0; 349 } 350 351 /* 352 * Attach an initialized mark to a given group and fs object. 353 * These marks may be used for the fsnotify backend to determine which 354 * event types should be delivered to which group. 355 */ 356 int fsnotify_add_mark_locked(struct fsnotify_mark *mark, 357 struct fsnotify_group *group, struct inode *inode, 358 struct vfsmount *mnt, int allow_dups) 359 { 360 int ret = 0; 361 362 BUG_ON(inode && mnt); 363 BUG_ON(!inode && !mnt); 364 BUG_ON(!mutex_is_locked(&group->mark_mutex)); 365 366 /* 367 * LOCKING ORDER!!!! 368 * group->mark_mutex 369 * mark->lock 370 * inode->i_lock 371 */ 372 spin_lock(&mark->lock); 373 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; 374 375 fsnotify_get_group(group); 376 mark->group = group; 377 list_add(&mark->g_list, &group->marks_list); 378 atomic_inc(&group->num_marks); 379 fsnotify_get_mark(mark); /* for i_list and g_list */ 380 381 if (inode) { 382 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); 383 if (ret) 384 goto err; 385 } else if (mnt) { 386 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); 387 if (ret) 388 goto err; 389 } else { 390 BUG(); 391 } 392 393 /* this will pin the object if appropriate */ 394 fsnotify_set_mark_mask_locked(mark, mark->mask); 395 spin_unlock(&mark->lock); 396 397 if (inode) 398 __fsnotify_update_child_dentry_flags(inode); 399 400 return ret; 401 err: 402 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 403 list_del_init(&mark->g_list); 404 fsnotify_put_group(group); 405 mark->group = NULL; 406 atomic_dec(&group->num_marks); 407 408 spin_unlock(&mark->lock); 409 410 spin_lock(&destroy_lock); 411 list_add(&mark->g_list, &destroy_list); 412 spin_unlock(&destroy_lock); 413 queue_delayed_work(system_unbound_wq, &reaper_work, 414 FSNOTIFY_REAPER_DELAY); 415 416 return ret; 417 } 418 419 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, 420 struct inode *inode, struct vfsmount *mnt, int allow_dups) 421 { 422 int ret; 423 mutex_lock(&group->mark_mutex); 424 ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); 425 mutex_unlock(&group->mark_mutex); 426 return ret; 427 } 428 429 /* 430 * Given a list of marks, find the mark associated with given group. If found 431 * take a reference to that mark and return it, else return NULL. 432 */ 433 struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, 434 struct fsnotify_group *group) 435 { 436 struct fsnotify_mark *mark; 437 438 hlist_for_each_entry(mark, head, obj_list) { 439 if (mark->group == group) { 440 fsnotify_get_mark(mark); 441 return mark; 442 } 443 } 444 return NULL; 445 } 446 447 /* 448 * clear any marks in a group in which mark->flags & flags is true 449 */ 450 void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, 451 unsigned int flags) 452 { 453 struct fsnotify_mark *lmark, *mark; 454 LIST_HEAD(to_free); 455 456 /* 457 * We have to be really careful here. Anytime we drop mark_mutex, e.g. 458 * fsnotify_clear_marks_by_inode() can come and free marks. Even in our 459 * to_free list so we have to use mark_mutex even when accessing that 460 * list. And freeing mark requires us to drop mark_mutex. So we can 461 * reliably free only the first mark in the list. That's why we first 462 * move marks to free to to_free list in one go and then free marks in 463 * to_free list one by one. 464 */ 465 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 466 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { 467 if (mark->flags & flags) 468 list_move(&mark->g_list, &to_free); 469 } 470 mutex_unlock(&group->mark_mutex); 471 472 while (1) { 473 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 474 if (list_empty(&to_free)) { 475 mutex_unlock(&group->mark_mutex); 476 break; 477 } 478 mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); 479 fsnotify_get_mark(mark); 480 fsnotify_detach_mark(mark); 481 mutex_unlock(&group->mark_mutex); 482 fsnotify_free_mark(mark); 483 fsnotify_put_mark(mark); 484 } 485 } 486 487 /* 488 * Given a group, prepare for freeing all the marks associated with that group. 489 * The marks are attached to the list of marks prepared for destruction, the 490 * caller is responsible for freeing marks in that list after SRCU period has 491 * ended. 492 */ 493 void fsnotify_detach_group_marks(struct fsnotify_group *group) 494 { 495 struct fsnotify_mark *mark; 496 497 while (1) { 498 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); 499 if (list_empty(&group->marks_list)) { 500 mutex_unlock(&group->mark_mutex); 501 break; 502 } 503 mark = list_first_entry(&group->marks_list, 504 struct fsnotify_mark, g_list); 505 fsnotify_get_mark(mark); 506 fsnotify_detach_mark(mark); 507 mutex_unlock(&group->mark_mutex); 508 __fsnotify_free_mark(mark); 509 fsnotify_put_mark(mark); 510 } 511 } 512 513 /* 514 * Nothing fancy, just initialize lists and locks and counters. 515 */ 516 void fsnotify_init_mark(struct fsnotify_mark *mark, 517 void (*free_mark)(struct fsnotify_mark *mark)) 518 { 519 memset(mark, 0, sizeof(*mark)); 520 spin_lock_init(&mark->lock); 521 atomic_set(&mark->refcnt, 1); 522 mark->free_mark = free_mark; 523 } 524 525 /* 526 * Destroy all marks in destroy_list, waits for SRCU period to finish before 527 * actually freeing marks. 528 */ 529 void fsnotify_mark_destroy_list(void) 530 { 531 struct fsnotify_mark *mark, *next; 532 struct list_head private_destroy_list; 533 534 spin_lock(&destroy_lock); 535 /* exchange the list head */ 536 list_replace_init(&destroy_list, &private_destroy_list); 537 spin_unlock(&destroy_lock); 538 539 synchronize_srcu(&fsnotify_mark_srcu); 540 541 list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { 542 list_del_init(&mark->g_list); 543 fsnotify_put_mark(mark); 544 } 545 } 546 547 static void fsnotify_mark_destroy_workfn(struct work_struct *work) 548 { 549 fsnotify_mark_destroy_list(); 550 } 551