// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/iversion.h>
#include <linux/filelock.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "crypto.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>

/*
 * Capability management
 *
 * The Ceph metadata servers control client access to inode metadata
 * and file data by issuing capabilities, granting clients permission
 * to read and/or write both inode fields and file data to OSDs
 * (storage nodes). Each capability consists of a set of bits
 * indicating which operations are allowed.
 *
 * If the client holds a *_SHARED cap, the client has a coherent value
 * that can be safely read from the cached inode.
 *
 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
 * client is allowed to change inode attributes (e.g., file size,
 * mtime), note its dirty state in the ceph_cap, and asynchronously
 * flush that metadata change to the MDS.
 *
 * In the event of a conflicting operation (perhaps by another
 * client), the MDS will revoke the conflicting client capabilities.
 *
 * In order for a client to cache an inode, it must hold a capability
 * from at least one MDS server. When inodes are released, release
 * notifications are batched and periodically sent en masse to the MDS
 * cluster to release server state.
 */

static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session,
				 struct ceph_inode_info *ci,
				 u64 oldest_flush_tid);

/*
 * Generate readable cap strings for debugging output.
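 *
 * For example, a mask of CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
 * CEPH_CAP_FILE_RD renders as "pFsr" (derived from the per-bit
 * letters emitted by gcap_string() below).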
55 */ 56 #define MAX_CAP_STR 20 57 static char cap_str[MAX_CAP_STR][40]; 58 static DEFINE_SPINLOCK(cap_str_lock); 59 static int last_cap_str; 60 61 static char *gcap_string(char *s, int c) 62 { 63 if (c & CEPH_CAP_GSHARED) 64 *s++ = 's'; 65 if (c & CEPH_CAP_GEXCL) 66 *s++ = 'x'; 67 if (c & CEPH_CAP_GCACHE) 68 *s++ = 'c'; 69 if (c & CEPH_CAP_GRD) 70 *s++ = 'r'; 71 if (c & CEPH_CAP_GWR) 72 *s++ = 'w'; 73 if (c & CEPH_CAP_GBUFFER) 74 *s++ = 'b'; 75 if (c & CEPH_CAP_GWREXTEND) 76 *s++ = 'a'; 77 if (c & CEPH_CAP_GLAZYIO) 78 *s++ = 'l'; 79 return s; 80 } 81 82 const char *ceph_cap_string(int caps) 83 { 84 int i; 85 char *s; 86 int c; 87 88 spin_lock(&cap_str_lock); 89 i = last_cap_str++; 90 if (last_cap_str == MAX_CAP_STR) 91 last_cap_str = 0; 92 spin_unlock(&cap_str_lock); 93 94 s = cap_str[i]; 95 96 if (caps & CEPH_CAP_PIN) 97 *s++ = 'p'; 98 99 c = (caps >> CEPH_CAP_SAUTH) & 3; 100 if (c) { 101 *s++ = 'A'; 102 s = gcap_string(s, c); 103 } 104 105 c = (caps >> CEPH_CAP_SLINK) & 3; 106 if (c) { 107 *s++ = 'L'; 108 s = gcap_string(s, c); 109 } 110 111 c = (caps >> CEPH_CAP_SXATTR) & 3; 112 if (c) { 113 *s++ = 'X'; 114 s = gcap_string(s, c); 115 } 116 117 c = caps >> CEPH_CAP_SFILE; 118 if (c) { 119 *s++ = 'F'; 120 s = gcap_string(s, c); 121 } 122 123 if (s == cap_str[i]) 124 *s++ = '-'; 125 *s = 0; 126 return cap_str[i]; 127 } 128 129 void ceph_caps_init(struct ceph_mds_client *mdsc) 130 { 131 INIT_LIST_HEAD(&mdsc->caps_list); 132 spin_lock_init(&mdsc->caps_list_lock); 133 } 134 135 void ceph_caps_finalize(struct ceph_mds_client *mdsc) 136 { 137 struct ceph_cap *cap; 138 139 spin_lock(&mdsc->caps_list_lock); 140 while (!list_empty(&mdsc->caps_list)) { 141 cap = list_first_entry(&mdsc->caps_list, 142 struct ceph_cap, caps_item); 143 list_del(&cap->caps_item); 144 kmem_cache_free(ceph_cap_cachep, cap); 145 } 146 mdsc->caps_total_count = 0; 147 mdsc->caps_avail_count = 0; 148 mdsc->caps_use_count = 0; 149 mdsc->caps_reserve_count = 0; 150 mdsc->caps_min_count = 0; 151 spin_unlock(&mdsc->caps_list_lock); 152 } 153 154 void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, 155 struct ceph_mount_options *fsopt) 156 { 157 spin_lock(&mdsc->caps_list_lock); 158 mdsc->caps_min_count = fsopt->max_readdir; 159 if (mdsc->caps_min_count < 1024) 160 mdsc->caps_min_count = 1024; 161 mdsc->caps_use_max = fsopt->caps_max; 162 if (mdsc->caps_use_max > 0 && 163 mdsc->caps_use_max < mdsc->caps_min_count) 164 mdsc->caps_use_max = mdsc->caps_min_count; 165 spin_unlock(&mdsc->caps_list_lock); 166 } 167 168 static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) 169 { 170 struct ceph_cap *cap; 171 int i; 172 173 if (nr_caps) { 174 BUG_ON(mdsc->caps_reserve_count < nr_caps); 175 mdsc->caps_reserve_count -= nr_caps; 176 if (mdsc->caps_avail_count >= 177 mdsc->caps_reserve_count + mdsc->caps_min_count) { 178 mdsc->caps_total_count -= nr_caps; 179 for (i = 0; i < nr_caps; i++) { 180 cap = list_first_entry(&mdsc->caps_list, 181 struct ceph_cap, caps_item); 182 list_del(&cap->caps_item); 183 kmem_cache_free(ceph_cap_cachep, cap); 184 } 185 } else { 186 mdsc->caps_avail_count += nr_caps; 187 } 188 189 dout("%s: caps %d = %d used + %d resv + %d avail\n", 190 __func__, 191 mdsc->caps_total_count, mdsc->caps_use_count, 192 mdsc->caps_reserve_count, mdsc->caps_avail_count); 193 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 194 mdsc->caps_reserve_count + 195 mdsc->caps_avail_count); 196 } 197 } 198 199 /* 200 * Called under mdsc->mutex. 
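 *
 * mdsc->mutex may be dropped and re-acquired while trimming session
 * caps if the allocator cannot satisfy the reservation directly.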
201 */ 202 int ceph_reserve_caps(struct ceph_mds_client *mdsc, 203 struct ceph_cap_reservation *ctx, int need) 204 { 205 int i, j; 206 struct ceph_cap *cap; 207 int have; 208 int alloc = 0; 209 int max_caps; 210 int err = 0; 211 bool trimmed = false; 212 struct ceph_mds_session *s; 213 LIST_HEAD(newcaps); 214 215 dout("reserve caps ctx=%p need=%d\n", ctx, need); 216 217 /* first reserve any caps that are already allocated */ 218 spin_lock(&mdsc->caps_list_lock); 219 if (mdsc->caps_avail_count >= need) 220 have = need; 221 else 222 have = mdsc->caps_avail_count; 223 mdsc->caps_avail_count -= have; 224 mdsc->caps_reserve_count += have; 225 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 226 mdsc->caps_reserve_count + 227 mdsc->caps_avail_count); 228 spin_unlock(&mdsc->caps_list_lock); 229 230 for (i = have; i < need; ) { 231 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 232 if (cap) { 233 list_add(&cap->caps_item, &newcaps); 234 alloc++; 235 i++; 236 continue; 237 } 238 239 if (!trimmed) { 240 for (j = 0; j < mdsc->max_sessions; j++) { 241 s = __ceph_lookup_mds_session(mdsc, j); 242 if (!s) 243 continue; 244 mutex_unlock(&mdsc->mutex); 245 246 mutex_lock(&s->s_mutex); 247 max_caps = s->s_nr_caps - (need - i); 248 ceph_trim_caps(mdsc, s, max_caps); 249 mutex_unlock(&s->s_mutex); 250 251 ceph_put_mds_session(s); 252 mutex_lock(&mdsc->mutex); 253 } 254 trimmed = true; 255 256 spin_lock(&mdsc->caps_list_lock); 257 if (mdsc->caps_avail_count) { 258 int more_have; 259 if (mdsc->caps_avail_count >= need - i) 260 more_have = need - i; 261 else 262 more_have = mdsc->caps_avail_count; 263 264 i += more_have; 265 have += more_have; 266 mdsc->caps_avail_count -= more_have; 267 mdsc->caps_reserve_count += more_have; 268 269 } 270 spin_unlock(&mdsc->caps_list_lock); 271 272 continue; 273 } 274 275 pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", 276 ctx, need, have + alloc); 277 err = -ENOMEM; 278 break; 279 } 280 281 if (!err) { 282 BUG_ON(have + alloc != need); 283 ctx->count = need; 284 ctx->used = 0; 285 } 286 287 spin_lock(&mdsc->caps_list_lock); 288 mdsc->caps_total_count += alloc; 289 mdsc->caps_reserve_count += alloc; 290 list_splice(&newcaps, &mdsc->caps_list); 291 292 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 293 mdsc->caps_reserve_count + 294 mdsc->caps_avail_count); 295 296 if (err) 297 __ceph_unreserve_caps(mdsc, have + alloc); 298 299 spin_unlock(&mdsc->caps_list_lock); 300 301 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 302 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 303 mdsc->caps_reserve_count, mdsc->caps_avail_count); 304 return err; 305 } 306 307 void ceph_unreserve_caps(struct ceph_mds_client *mdsc, 308 struct ceph_cap_reservation *ctx) 309 { 310 bool reclaim = false; 311 if (!ctx->count) 312 return; 313 314 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 315 spin_lock(&mdsc->caps_list_lock); 316 __ceph_unreserve_caps(mdsc, ctx->count); 317 ctx->count = 0; 318 319 if (mdsc->caps_use_max > 0 && 320 mdsc->caps_use_count > mdsc->caps_use_max) 321 reclaim = true; 322 spin_unlock(&mdsc->caps_list_lock); 323 324 if (reclaim) 325 ceph_reclaim_caps_nr(mdsc, ctx->used); 326 } 327 328 struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, 329 struct ceph_cap_reservation *ctx) 330 { 331 struct ceph_cap *cap = NULL; 332 333 /* temporary, until we do something about cap import/export */ 334 if (!ctx) { 335 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 336 if (cap) { 337 spin_lock(&mdsc->caps_list_lock); 338 
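			/* unreserved allocation: counts against use and total,
			 * not the reserve pool */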
mdsc->caps_use_count++; 339 mdsc->caps_total_count++; 340 spin_unlock(&mdsc->caps_list_lock); 341 } else { 342 spin_lock(&mdsc->caps_list_lock); 343 if (mdsc->caps_avail_count) { 344 BUG_ON(list_empty(&mdsc->caps_list)); 345 346 mdsc->caps_avail_count--; 347 mdsc->caps_use_count++; 348 cap = list_first_entry(&mdsc->caps_list, 349 struct ceph_cap, caps_item); 350 list_del(&cap->caps_item); 351 352 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 353 mdsc->caps_reserve_count + mdsc->caps_avail_count); 354 } 355 spin_unlock(&mdsc->caps_list_lock); 356 } 357 358 return cap; 359 } 360 361 spin_lock(&mdsc->caps_list_lock); 362 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 363 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, 364 mdsc->caps_reserve_count, mdsc->caps_avail_count); 365 BUG_ON(!ctx->count); 366 BUG_ON(ctx->count > mdsc->caps_reserve_count); 367 BUG_ON(list_empty(&mdsc->caps_list)); 368 369 ctx->count--; 370 ctx->used++; 371 mdsc->caps_reserve_count--; 372 mdsc->caps_use_count++; 373 374 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); 375 list_del(&cap->caps_item); 376 377 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 378 mdsc->caps_reserve_count + mdsc->caps_avail_count); 379 spin_unlock(&mdsc->caps_list_lock); 380 return cap; 381 } 382 383 void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) 384 { 385 spin_lock(&mdsc->caps_list_lock); 386 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 387 cap, mdsc->caps_total_count, mdsc->caps_use_count, 388 mdsc->caps_reserve_count, mdsc->caps_avail_count); 389 mdsc->caps_use_count--; 390 /* 391 * Keep some preallocated caps around (ceph_min_count), to 392 * avoid lots of free/alloc churn. 393 */ 394 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + 395 mdsc->caps_min_count) { 396 mdsc->caps_total_count--; 397 kmem_cache_free(ceph_cap_cachep, cap); 398 } else { 399 mdsc->caps_avail_count++; 400 list_add(&cap->caps_item, &mdsc->caps_list); 401 } 402 403 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 404 mdsc->caps_reserve_count + mdsc->caps_avail_count); 405 spin_unlock(&mdsc->caps_list_lock); 406 } 407 408 void ceph_reservation_status(struct ceph_fs_client *fsc, 409 int *total, int *avail, int *used, int *reserved, 410 int *min) 411 { 412 struct ceph_mds_client *mdsc = fsc->mdsc; 413 414 spin_lock(&mdsc->caps_list_lock); 415 416 if (total) 417 *total = mdsc->caps_total_count; 418 if (avail) 419 *avail = mdsc->caps_avail_count; 420 if (used) 421 *used = mdsc->caps_use_count; 422 if (reserved) 423 *reserved = mdsc->caps_reserve_count; 424 if (min) 425 *min = mdsc->caps_min_count; 426 427 spin_unlock(&mdsc->caps_list_lock); 428 } 429 430 /* 431 * Find ceph_cap for given mds, if any. 432 * 433 * Called with i_ceph_lock held. 434 */ 435 struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) 436 { 437 struct ceph_cap *cap; 438 struct rb_node *n = ci->i_caps.rb_node; 439 440 while (n) { 441 cap = rb_entry(n, struct ceph_cap, ci_node); 442 if (mds < cap->mds) 443 n = n->rb_left; 444 else if (mds > cap->mds) 445 n = n->rb_right; 446 else 447 return cap; 448 } 449 return NULL; 450 } 451 452 struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) 453 { 454 struct ceph_cap *cap; 455 456 spin_lock(&ci->i_ceph_lock); 457 cap = __get_cap_for_mds(ci, mds); 458 spin_unlock(&ci->i_ceph_lock); 459 return cap; 460 } 461 462 /* 463 * Called under i_ceph_lock. 
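 *
 * Caps are keyed by MDS rank in ci->i_caps; each inode holds at most
 * one cap per MDS, so inserting a duplicate rank is a bug.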
464 */ 465 static void __insert_cap_node(struct ceph_inode_info *ci, 466 struct ceph_cap *new) 467 { 468 struct rb_node **p = &ci->i_caps.rb_node; 469 struct rb_node *parent = NULL; 470 struct ceph_cap *cap = NULL; 471 472 while (*p) { 473 parent = *p; 474 cap = rb_entry(parent, struct ceph_cap, ci_node); 475 if (new->mds < cap->mds) 476 p = &(*p)->rb_left; 477 else if (new->mds > cap->mds) 478 p = &(*p)->rb_right; 479 else 480 BUG(); 481 } 482 483 rb_link_node(&new->ci_node, parent, p); 484 rb_insert_color(&new->ci_node, &ci->i_caps); 485 } 486 487 /* 488 * (re)set cap hold timeouts, which control the delayed release 489 * of unused caps back to the MDS. Should be called on cap use. 490 */ 491 static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 492 struct ceph_inode_info *ci) 493 { 494 struct ceph_mount_options *opt = mdsc->fsc->mount_options; 495 ci->i_hold_caps_max = round_jiffies(jiffies + 496 opt->caps_wanted_delay_max * HZ); 497 dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, 498 ci->i_hold_caps_max - jiffies); 499 } 500 501 /* 502 * (Re)queue cap at the end of the delayed cap release list. 503 * 504 * If I_FLUSH is set, leave the inode at the front of the list. 505 * 506 * Caller holds i_ceph_lock 507 * -> we take mdsc->cap_delay_lock 508 */ 509 static void __cap_delay_requeue(struct ceph_mds_client *mdsc, 510 struct ceph_inode_info *ci) 511 { 512 dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, 513 ci->i_ceph_flags, ci->i_hold_caps_max); 514 if (!mdsc->stopping) { 515 spin_lock(&mdsc->cap_delay_lock); 516 if (!list_empty(&ci->i_cap_delay_list)) { 517 if (ci->i_ceph_flags & CEPH_I_FLUSH) 518 goto no_change; 519 list_del_init(&ci->i_cap_delay_list); 520 } 521 __cap_set_timeouts(mdsc, ci); 522 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 523 no_change: 524 spin_unlock(&mdsc->cap_delay_lock); 525 } 526 } 527 528 /* 529 * Queue an inode for immediate writeback. Mark inode with I_FLUSH, 530 * indicating we should send a cap message to flush dirty metadata 531 * asap, and move to the front of the delayed cap list. 532 */ 533 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, 534 struct ceph_inode_info *ci) 535 { 536 dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); 537 spin_lock(&mdsc->cap_delay_lock); 538 ci->i_ceph_flags |= CEPH_I_FLUSH; 539 if (!list_empty(&ci->i_cap_delay_list)) 540 list_del_init(&ci->i_cap_delay_list); 541 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); 542 spin_unlock(&mdsc->cap_delay_lock); 543 } 544 545 /* 546 * Cancel delayed work on cap. 547 * 548 * Caller must hold i_ceph_lock. 549 */ 550 static void __cap_delay_cancel(struct ceph_mds_client *mdsc, 551 struct ceph_inode_info *ci) 552 { 553 dout("__cap_delay_cancel %p\n", &ci->netfs.inode); 554 if (list_empty(&ci->i_cap_delay_list)) 555 return; 556 spin_lock(&mdsc->cap_delay_lock); 557 list_del_init(&ci->i_cap_delay_list); 558 spin_unlock(&mdsc->cap_delay_lock); 559 } 560 561 /* Common issue checks for add_cap, handle_cap_grant. */ 562 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, 563 unsigned issued) 564 { 565 unsigned had = __ceph_caps_issued(ci, NULL); 566 567 lockdep_assert_held(&ci->i_ceph_lock); 568 569 /* 570 * Each time we receive FILE_CACHE anew, we increment 571 * i_rdcache_gen. 
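 * (A racing page-cache invalidation compares this generation, e.g. in
 * try_nonblocking_invalidate() below, to detect that the cache may
 * have been repopulated while it ran.)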
 */
	if (S_ISREG(ci->netfs.inode.i_mode) &&
	    (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
		ci->i_rdcache_gen++;
	}

	/*
	 * If FILE_SHARED is newly issued, mark the directory not complete.
	 * We don't know what happened to this directory while we didn't
	 * have the cap. If FILE_SHARED is being revoked, also mark the
	 * directory not complete; this stops any ongoing cached readdir.
	 */
	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
		if (issued & CEPH_CAP_FILE_SHARED)
			atomic_inc(&ci->i_shared_gen);
		if (S_ISDIR(ci->netfs.inode.i_mode)) {
			dout(" marking %p NOT complete\n", &ci->netfs.inode);
			__ceph_dir_clear_complete(ci);
		}
	}

	/* Wipe saved layout if we're losing DIR_CREATE caps */
	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
	    !(issued & CEPH_CAP_DIR_CREATE)) {
		ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
		memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
	}
}

/**
 * change_auth_cap_ses - move inode to appropriate lists when auth caps change
 * @ci: inode to be moved
 * @session: new auth caps session
 */
void change_auth_cap_ses(struct ceph_inode_info *ci,
			 struct ceph_mds_session *session)
{
	lockdep_assert_held(&ci->i_ceph_lock);

	if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
		return;

	spin_lock(&session->s_mdsc->cap_dirty_lock);
	if (!list_empty(&ci->i_dirty_item))
		list_move(&ci->i_dirty_item, &session->s_cap_dirty);
	if (!list_empty(&ci->i_flushing_item))
		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
	spin_unlock(&session->s_mdsc->cap_dirty_lock);
}

/*
 * Add a capability under the given MDS session.
 *
 * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
 *
 * @fmode is the open file mode, if we are opening a file, otherwise
 * it is < 0. (This is so we can atomically add the cap and add an
 * open file reference to it.)
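 *
 * @new_cap should point to a cap preallocated by the caller (e.g. via
 * ceph_get_cap()); it is set to NULL here once it has been consumed.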
 */
void ceph_add_cap(struct inode *inode,
		  struct ceph_mds_session *session, u64 cap_id,
		  unsigned issued, unsigned wanted,
		  unsigned seq, unsigned mseq, u64 realmino, int flags,
		  struct ceph_cap **new_cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;
	u32 gen;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	gen = atomic_read(&session->s_cap_gen);

	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		cap = *new_cap;
		*new_cap = NULL;

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;
		cap->mseq = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		atomic64_inc(&mdsc->metric.total_caps);
		spin_unlock(&session->s_cap_lock);
	} else {
		spin_lock(&session->s_cap_lock);
		list_move_tail(&cap->session_caps, &session->s_caps);
		spin_unlock(&session->s_cap_lock);

		if (cap->cap_gen < gen)
			cap->issued = cap->implemented = CEPH_CAP_PIN;

		/*
		 * auth mds of the inode changed. we received the cap export
		 * message, but still haven't received the cap import message.
		 * handle_cap_export() updated the new auth MDS' cap.
		 *
		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
		 * a message that was sent before the cap import message. So
		 * don't remove caps.
		 */
		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
			WARN_ON(cap != ci->i_auth_cap);
			WARN_ON(cap->cap_id != cap_id);
			seq = cap->seq;
			mseq = cap->mseq;
			issued |= cap->issued;
			flags |= CEPH_CAP_FLAG_AUTH;
		}
	}

	if (!ci->i_snap_realm ||
	    ((flags & CEPH_CAP_FLAG_AUTH) &&
	     realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
								       realmino);
		if (realm)
			ceph_change_snap_realm(inode, realm);
		else
			WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
			     __func__, realmino, ci->i_vino.ino,
			     ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	if (flags & CEPH_CAP_FLAG_AUTH) {
		if (!ci->i_auth_cap ||
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			if (ci->i_auth_cap &&
			    ci->i_auth_cap->session != cap->session)
				change_auth_cap_ses(ci, cap->session);
			ci->i_auth_cap = cap;
			cap->mds_wanted = wanted;
		}
	} else {
		WARN_ON(ci->i_auth_cap == cap);
	}

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
		cap->mds_wanted = wanted;
	else
		cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = gen;
	wake_up_all(&ci->i_cap_wq);
}

/*
 * Return true if cap has not timed out and belongs to the current
 * generation of the MDS session (i.e. has not gone 'stale' due to
 * us losing touch with the mds).
 */
static int __cap_is_valid(struct ceph_cap *cap)
{
	unsigned long ttl;
	u32 gen;

	gen = atomic_read(&cap->session->s_cap_gen);
	ttl = cap->session->s_cap_ttl;

	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
		dout("__cap_is_valid %p cap %p issued %s "
		     "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
		return 0;
	}

	return 1;
}

/*
 * Return set of valid cap bits issued to us. Note that caps time
 * out, and may be invalidated in bulk if the client session times out
 * and session->s_cap_gen is bumped.
 */
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
	int have = ci->i_snap_caps;
	struct ceph_cap *cap;
	struct rb_node *p;

	if (implemented)
		*implemented = 0;
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		dout("__ceph_caps_issued %p cap %p issued %s\n",
		     &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
		have |= cap->issued;
		if (implemented)
			*implemented |= cap->implemented;
	}
	/*
	 * Exclude caps that were issued by a non-auth MDS but are being
	 * revoked by the auth MDS. The non-auth MDS should be
	 * revoking/exporting these caps, but the message is delayed.
	 */
	if (ci->i_auth_cap) {
		cap = ci->i_auth_cap;
		have &= ~cap->implemented | cap->issued;
	}
	return have;
}

/*
 * Get cap bits issued by caps other than @ocap
 */
int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
{
	int have = ci->i_snap_caps;
	struct ceph_cap *cap;
	struct rb_node *p;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (cap == ocap)
			continue;
		if (!__cap_is_valid(cap))
			continue;
		have |= cap->issued;
	}
	return have;
}

/*
 * Move a cap to the end of the LRU (oldest caps at list head, newest
 * at list tail).
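 * (Presumably so that cap trimming, which works from the head of the
 * session cap list, drops the least recently used caps first.)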
843 */ 844 static void __touch_cap(struct ceph_cap *cap) 845 { 846 struct ceph_mds_session *s = cap->session; 847 848 spin_lock(&s->s_cap_lock); 849 if (!s->s_cap_iterator) { 850 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, 851 s->s_mds); 852 list_move_tail(&cap->session_caps, &s->s_caps); 853 } else { 854 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", 855 &cap->ci->netfs.inode, cap, s->s_mds); 856 } 857 spin_unlock(&s->s_cap_lock); 858 } 859 860 /* 861 * Check if we hold the given mask. If so, move the cap(s) to the 862 * front of their respective LRUs. (This is the preferred way for 863 * callers to check for caps they want.) 864 */ 865 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) 866 { 867 struct ceph_cap *cap; 868 struct rb_node *p; 869 int have = ci->i_snap_caps; 870 871 if ((have & mask) == mask) { 872 dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" 873 " (mask %s)\n", ceph_ino(&ci->netfs.inode), 874 ceph_cap_string(have), 875 ceph_cap_string(mask)); 876 return 1; 877 } 878 879 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 880 cap = rb_entry(p, struct ceph_cap, ci_node); 881 if (!__cap_is_valid(cap)) 882 continue; 883 if ((cap->issued & mask) == mask) { 884 dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" 885 " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, 886 ceph_cap_string(cap->issued), 887 ceph_cap_string(mask)); 888 if (touch) 889 __touch_cap(cap); 890 return 1; 891 } 892 893 /* does a combination of caps satisfy mask? */ 894 have |= cap->issued; 895 if ((have & mask) == mask) { 896 dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" 897 " (mask %s)\n", ceph_ino(&ci->netfs.inode), 898 ceph_cap_string(cap->issued), 899 ceph_cap_string(mask)); 900 if (touch) { 901 struct rb_node *q; 902 903 /* touch this + preceding caps */ 904 __touch_cap(cap); 905 for (q = rb_first(&ci->i_caps); q != p; 906 q = rb_next(q)) { 907 cap = rb_entry(q, struct ceph_cap, 908 ci_node); 909 if (!__cap_is_valid(cap)) 910 continue; 911 if (cap->issued & mask) 912 __touch_cap(cap); 913 } 914 } 915 return 1; 916 } 917 } 918 919 return 0; 920 } 921 922 int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, 923 int touch) 924 { 925 struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); 926 int r; 927 928 r = __ceph_caps_issued_mask(ci, mask, touch); 929 if (r) 930 ceph_update_cap_hit(&fsc->mdsc->metric); 931 else 932 ceph_update_cap_mis(&fsc->mdsc->metric); 933 return r; 934 } 935 936 /* 937 * Return true if mask caps are currently being revoked by an MDS. 
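 * A cap is revoking @mask when those bits are still implemented (held
 * on behalf of the MDS) but no longer issued.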
938 */ 939 int __ceph_caps_revoking_other(struct ceph_inode_info *ci, 940 struct ceph_cap *ocap, int mask) 941 { 942 struct ceph_cap *cap; 943 struct rb_node *p; 944 945 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 946 cap = rb_entry(p, struct ceph_cap, ci_node); 947 if (cap != ocap && 948 (cap->implemented & ~cap->issued & mask)) 949 return 1; 950 } 951 return 0; 952 } 953 954 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) 955 { 956 struct inode *inode = &ci->netfs.inode; 957 int ret; 958 959 spin_lock(&ci->i_ceph_lock); 960 ret = __ceph_caps_revoking_other(ci, NULL, mask); 961 spin_unlock(&ci->i_ceph_lock); 962 dout("ceph_caps_revoking %p %s = %d\n", inode, 963 ceph_cap_string(mask), ret); 964 return ret; 965 } 966 967 int __ceph_caps_used(struct ceph_inode_info *ci) 968 { 969 int used = 0; 970 if (ci->i_pin_ref) 971 used |= CEPH_CAP_PIN; 972 if (ci->i_rd_ref) 973 used |= CEPH_CAP_FILE_RD; 974 if (ci->i_rdcache_ref || 975 (S_ISREG(ci->netfs.inode.i_mode) && 976 ci->netfs.inode.i_data.nrpages)) 977 used |= CEPH_CAP_FILE_CACHE; 978 if (ci->i_wr_ref) 979 used |= CEPH_CAP_FILE_WR; 980 if (ci->i_wb_ref || ci->i_wrbuffer_ref) 981 used |= CEPH_CAP_FILE_BUFFER; 982 if (ci->i_fx_ref) 983 used |= CEPH_CAP_FILE_EXCL; 984 return used; 985 } 986 987 #define FMODE_WAIT_BIAS 1000 988 989 /* 990 * wanted, by virtue of open file modes 991 */ 992 int __ceph_caps_file_wanted(struct ceph_inode_info *ci) 993 { 994 const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); 995 const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); 996 const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); 997 const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); 998 struct ceph_mount_options *opt = 999 ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; 1000 unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; 1001 unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; 1002 1003 if (S_ISDIR(ci->netfs.inode.i_mode)) { 1004 int want = 0; 1005 1006 /* use used_cutoff here, to keep dir's wanted caps longer */ 1007 if (ci->i_nr_by_mode[RD_SHIFT] > 0 || 1008 time_after(ci->i_last_rd, used_cutoff)) 1009 want |= CEPH_CAP_ANY_SHARED; 1010 1011 if (ci->i_nr_by_mode[WR_SHIFT] > 0 || 1012 time_after(ci->i_last_wr, used_cutoff)) { 1013 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; 1014 if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) 1015 want |= CEPH_CAP_ANY_DIR_OPS; 1016 } 1017 1018 if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) 1019 want |= CEPH_CAP_PIN; 1020 1021 return want; 1022 } else { 1023 int bits = 0; 1024 1025 if (ci->i_nr_by_mode[RD_SHIFT] > 0) { 1026 if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || 1027 time_after(ci->i_last_rd, used_cutoff)) 1028 bits |= 1 << RD_SHIFT; 1029 } else if (time_after(ci->i_last_rd, idle_cutoff)) { 1030 bits |= 1 << RD_SHIFT; 1031 } 1032 1033 if (ci->i_nr_by_mode[WR_SHIFT] > 0) { 1034 if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || 1035 time_after(ci->i_last_wr, used_cutoff)) 1036 bits |= 1 << WR_SHIFT; 1037 } else if (time_after(ci->i_last_wr, idle_cutoff)) { 1038 bits |= 1 << WR_SHIFT; 1039 } 1040 1041 /* check lazyio only when read/write is wanted */ 1042 if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && 1043 ci->i_nr_by_mode[LAZY_SHIFT] > 0) 1044 bits |= 1 << LAZY_SHIFT; 1045 1046 return bits ? 
			ceph_caps_for_mode(bits >> 1) : 0;
	}
}

/*
 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
 */
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
	if (S_ISDIR(ci->netfs.inode.i_mode)) {
		/* we want EXCL if holding caps of dir ops */
		if (w & CEPH_CAP_ANY_DIR_OPS)
			w |= CEPH_CAP_FILE_EXCL;
	} else {
		/* we want EXCL if dirty data */
		if (w & CEPH_CAP_FILE_BUFFER)
			w |= CEPH_CAP_FILE_EXCL;
	}
	return w;
}

/*
 * Return caps we have registered with the MDS(s) as 'wanted'.
 */
int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int mds_wanted = 0;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (check && !__cap_is_valid(cap))
			continue;
		if (cap == ci->i_auth_cap)
			mds_wanted |= cap->mds_wanted;
		else
			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
	}
	return mds_wanted;
}

int ceph_is_any_caps(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	spin_lock(&ci->i_ceph_lock);
	ret = __ceph_is_any_real_caps(ci);
	spin_unlock(&ci->i_ceph_lock);

	return ret;
}

/*
 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc;
	int removed = 0;

	/* 'ci' being NULL means the removal has already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);

	mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;

	/* remove from inode's cap rbtree, and clear auth cap */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		atomic64_dec(&mdsc->metric.total_caps);
		cap->session = NULL;
		removed = 1;
	}
	/* protect backpointer with s_cap_lock: see iterate_session_caps */
	cap->ci = NULL;

	/*
	 * s_cap_reconnect is protected by s_cap_lock. no one changes
	 * s_cap_gen while session is in the reconnect state.
	 */
	if (queue_release &&
	    (!session->s_cap_reconnect ||
	     cap->cap_gen == atomic_read(&session->s_cap_gen))) {
		cap->queue_release = 1;
		if (removed) {
			__ceph_queue_cap_release(session, cap);
			removed = 0;
		}
	} else {
		cap->queue_release = 0;
	}
	cap->cap_ino = ci->i_vino.ino;

	spin_unlock(&session->s_cap_lock);

	if (removed)
		ceph_put_cap(mdsc, cap);

	if (!__ceph_is_any_real_caps(ci)) {
		/* when reconnect is denied, we remove session caps forcibly,
		 * so i_wr_ref can be non-zero. If there are ongoing writes,
		 * keep i_snap_realm.
		 */
		if (ci->i_wr_ref == 0 && ci->i_snap_realm)
			ceph_change_snap_realm(&ci->netfs.inode, NULL);

		__cap_delay_cancel(mdsc, ci);
	}
}

void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
		     bool queue_release)
{
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_fs_client *fsc;

	/* 'ci' being NULL means the removal has already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
	WARN_ON_ONCE(ci->i_auth_cap == cap &&
		     !list_empty(&ci->i_dirty_item) &&
		     !fsc->blocklisted &&
		     !ceph_inode_is_shutdown(&ci->netfs.inode));

	__ceph_remove_cap(cap, queue_release);
}

struct cap_msg_args {
	struct ceph_mds_session *session;
	u64 ino, cid, follows;
	u64 flush_tid, oldest_flush_tid, size, max_size;
	u64 xattr_version;
	u64 change_attr;
	struct ceph_buffer *xattr_buf;
	struct ceph_buffer *old_xattr_buf;
	struct timespec64 atime, mtime, ctime, btime;
	int op, caps, wanted, dirty;
	u32 seq, issue_seq, mseq, time_warp_seq;
	u32 flags;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool inline_data;
	bool wake;
	bool encrypted;
	u32 fscrypt_auth_len;
	u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
};

/* Marshal up the cap msg to the MDS */
static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
	struct ceph_mds_caps *fc;
	void *p;
	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;

	dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
	     __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
	     ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
	     ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
	     arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
	     arg->size, arg->max_size, arg->xattr_version,
	     arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);

	msg->hdr.version = cpu_to_le16(12);
	msg->hdr.tid = cpu_to_le64(arg->flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	fc->cap_id = cpu_to_le64(arg->cid);
	fc->op = cpu_to_le32(arg->op);
	fc->seq = cpu_to_le32(arg->seq);
	fc->issue_seq = cpu_to_le32(arg->issue_seq);
	fc->migrate_seq = cpu_to_le32(arg->mseq);
	fc->caps = cpu_to_le32(arg->caps);
	fc->wanted = cpu_to_le32(arg->wanted);
	fc->dirty = cpu_to_le32(arg->dirty);
	fc->ino = cpu_to_le64(arg->ino);
	fc->snap_follows = cpu_to_le64(arg->follows);

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
	if (arg->encrypted)
		fc->size = cpu_to_le64(round_up(arg->size,
						CEPH_FSCRYPT_BLOCK_SIZE));
	else
#endif
		fc->size = cpu_to_le64(arg->size);
	fc->max_size = cpu_to_le64(arg->max_size);
	ceph_encode_timespec64(&fc->mtime, &arg->mtime);
	ceph_encode_timespec64(&fc->atime, &arg->atime);
	ceph_encode_timespec64(&fc->ctime, &arg->ctime);
	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);

	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
	fc->mode = cpu_to_le32(arg->mode);

	fc->xattr_version = cpu_to_le64(arg->xattr_version);
	if (arg->xattr_buf) {
		msg->middle = ceph_buffer_get(arg->xattr_buf);
		fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
	}

	p = fc + 1;
	/* flock buffer size (version 2) */
	ceph_encode_32(&p, 0);
	/* inline version (version 4) */
	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
	/* inline data size */
	ceph_encode_32(&p, 0);
	/*
	 * osd_epoch_barrier (version 5)
	 * The epoch_barrier is protected by osdc->lock, so READ_ONCE here in
	 * case it was recently changed
	 */
	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
	/* oldest_flush_tid (version 6) */
	ceph_encode_64(&p, arg->oldest_flush_tid);

	/*
	 * caller_uid/caller_gid (version 7)
	 *
	 * Currently, we don't properly track which caller dirtied the caps
	 * last, and force a flush of them when there is a conflict. For now,
	 * just set this to 0:0, to emulate how the MDS has worked up to now.
	 */
	ceph_encode_32(&p, 0);
	ceph_encode_32(&p, 0);

	/* pool namespace (version 8) (mds always ignores this) */
	ceph_encode_32(&p, 0);

	/* btime and change_attr (version 9) */
	ceph_encode_timespec64(p, &arg->btime);
	p += sizeof(struct ceph_timespec);
	ceph_encode_64(&p, arg->change_attr);

	/* Advisory flags (version 10) */
	ceph_encode_32(&p, arg->flags);

	/* dirstats (version 11) - these are r/o on the client */
	ceph_encode_64(&p, 0);
	ceph_encode_64(&p, 0);

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
	/*
	 * fscrypt_auth and fscrypt_file (version 12)
	 *
	 * fscrypt_auth holds the crypto context (if any). fscrypt_file
	 * tracks the real i_size as an __le64 field (and we use a rounded-up
	 * i_size in the traditional size field).
1330 */ 1331 ceph_encode_32(&p, arg->fscrypt_auth_len); 1332 ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); 1333 ceph_encode_32(&p, sizeof(__le64)); 1334 ceph_encode_64(&p, arg->size); 1335 #else /* CONFIG_FS_ENCRYPTION */ 1336 ceph_encode_32(&p, 0); 1337 ceph_encode_32(&p, 0); 1338 #endif /* CONFIG_FS_ENCRYPTION */ 1339 } 1340 1341 /* 1342 * Queue cap releases when an inode is dropped from our cache. 1343 */ 1344 void __ceph_remove_caps(struct ceph_inode_info *ci) 1345 { 1346 struct inode *inode = &ci->netfs.inode; 1347 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; 1348 struct rb_node *p; 1349 1350 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) 1351 * may call __ceph_caps_issued_mask() on a freeing inode. */ 1352 spin_lock(&ci->i_ceph_lock); 1353 p = rb_first(&ci->i_caps); 1354 while (p) { 1355 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1356 p = rb_next(p); 1357 ceph_remove_cap(mdsc, cap, true); 1358 } 1359 spin_unlock(&ci->i_ceph_lock); 1360 } 1361 1362 /* 1363 * Prepare to send a cap message to an MDS. Update the cap state, and populate 1364 * the arg struct with the parameters that will need to be sent. This should 1365 * be done under the i_ceph_lock to guard against changes to cap state. 1366 * 1367 * Make note of max_size reported/requested from mds, revoked caps 1368 * that have now been implemented. 1369 */ 1370 static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, 1371 int op, int flags, int used, int want, int retain, 1372 int flushing, u64 flush_tid, u64 oldest_flush_tid) 1373 { 1374 struct ceph_inode_info *ci = cap->ci; 1375 struct inode *inode = &ci->netfs.inode; 1376 int held, revoking; 1377 1378 lockdep_assert_held(&ci->i_ceph_lock); 1379 1380 held = cap->issued | cap->implemented; 1381 revoking = cap->implemented & ~cap->issued; 1382 retain &= ~revoking; 1383 1384 dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", 1385 __func__, inode, cap, cap->session, 1386 ceph_cap_string(held), ceph_cap_string(held & retain), 1387 ceph_cap_string(revoking)); 1388 BUG_ON((retain & CEPH_CAP_PIN) == 0); 1389 1390 ci->i_ceph_flags &= ~CEPH_I_FLUSH; 1391 1392 cap->issued &= retain; /* drop bits we don't want */ 1393 /* 1394 * Wake up any waiters on wanted -> needed transition. This is due to 1395 * the weird transition from buffered to sync IO... we need to flush 1396 * dirty pages _before_ allowing sync writes to avoid reordering. 1397 */ 1398 arg->wake = cap->implemented & ~cap->issued; 1399 cap->implemented &= cap->issued | used; 1400 cap->mds_wanted = want; 1401 1402 arg->session = cap->session; 1403 arg->ino = ceph_vino(inode).ino; 1404 arg->cid = cap->cap_id; 1405 arg->follows = flushing ? 
ci->i_head_snapc->seq : 0; 1406 arg->flush_tid = flush_tid; 1407 arg->oldest_flush_tid = oldest_flush_tid; 1408 arg->size = i_size_read(inode); 1409 ci->i_reported_size = arg->size; 1410 arg->max_size = ci->i_wanted_max_size; 1411 if (cap == ci->i_auth_cap) { 1412 if (want & CEPH_CAP_ANY_FILE_WR) 1413 ci->i_requested_max_size = arg->max_size; 1414 else 1415 ci->i_requested_max_size = 0; 1416 } 1417 1418 if (flushing & CEPH_CAP_XATTR_EXCL) { 1419 arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); 1420 arg->xattr_version = ci->i_xattrs.version; 1421 arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); 1422 } else { 1423 arg->xattr_buf = NULL; 1424 arg->old_xattr_buf = NULL; 1425 } 1426 1427 arg->mtime = inode->i_mtime; 1428 arg->atime = inode->i_atime; 1429 arg->ctime = inode_get_ctime(inode); 1430 arg->btime = ci->i_btime; 1431 arg->change_attr = inode_peek_iversion_raw(inode); 1432 1433 arg->op = op; 1434 arg->caps = cap->implemented; 1435 arg->wanted = want; 1436 arg->dirty = flushing; 1437 1438 arg->seq = cap->seq; 1439 arg->issue_seq = cap->issue_seq; 1440 arg->mseq = cap->mseq; 1441 arg->time_warp_seq = ci->i_time_warp_seq; 1442 1443 arg->uid = inode->i_uid; 1444 arg->gid = inode->i_gid; 1445 arg->mode = inode->i_mode; 1446 1447 arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1448 if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && 1449 !list_empty(&ci->i_cap_snaps)) { 1450 struct ceph_cap_snap *capsnap; 1451 list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { 1452 if (capsnap->cap_flush.tid) 1453 break; 1454 if (capsnap->need_flush) { 1455 flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; 1456 break; 1457 } 1458 } 1459 } 1460 arg->flags = flags; 1461 arg->encrypted = IS_ENCRYPTED(inode); 1462 #if IS_ENABLED(CONFIG_FS_ENCRYPTION) 1463 if (ci->fscrypt_auth_len && 1464 WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { 1465 /* Don't set this if it's too big */ 1466 arg->fscrypt_auth_len = 0; 1467 } else { 1468 arg->fscrypt_auth_len = ci->fscrypt_auth_len; 1469 memcpy(arg->fscrypt_auth, ci->fscrypt_auth, 1470 min_t(size_t, ci->fscrypt_auth_len, 1471 sizeof(arg->fscrypt_auth))); 1472 } 1473 #endif /* CONFIG_FS_ENCRYPTION */ 1474 } 1475 1476 #if IS_ENABLED(CONFIG_FS_ENCRYPTION) 1477 #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 1478 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) 1479 1480 static inline int cap_msg_size(struct cap_msg_args *arg) 1481 { 1482 return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; 1483 } 1484 #else 1485 #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 1486 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) 1487 1488 static inline int cap_msg_size(struct cap_msg_args *arg) 1489 { 1490 return CAP_MSG_FIXED_FIELDS; 1491 } 1492 #endif /* CONFIG_FS_ENCRYPTION */ 1493 1494 /* 1495 * Send a cap msg on the given inode. 1496 * 1497 * Caller should hold snap_rwsem (read), s_mutex. 
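 *
 * If the message cannot be allocated, the inode is requeued on the
 * delayed cap list so the flush can be retried later.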
1498 */ 1499 static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) 1500 { 1501 struct ceph_msg *msg; 1502 struct inode *inode = &ci->netfs.inode; 1503 1504 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, 1505 false); 1506 if (!msg) { 1507 pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", 1508 ceph_vinop(inode), ceph_cap_string(arg->dirty), 1509 arg->flush_tid); 1510 spin_lock(&ci->i_ceph_lock); 1511 __cap_delay_requeue(arg->session->s_mdsc, ci); 1512 spin_unlock(&ci->i_ceph_lock); 1513 return; 1514 } 1515 1516 encode_cap_msg(msg, arg); 1517 ceph_con_send(&arg->session->s_con, msg); 1518 ceph_buffer_put(arg->old_xattr_buf); 1519 ceph_buffer_put(arg->xattr_buf); 1520 if (arg->wake) 1521 wake_up_all(&ci->i_cap_wq); 1522 } 1523 1524 static inline int __send_flush_snap(struct inode *inode, 1525 struct ceph_mds_session *session, 1526 struct ceph_cap_snap *capsnap, 1527 u32 mseq, u64 oldest_flush_tid) 1528 { 1529 struct cap_msg_args arg; 1530 struct ceph_msg *msg; 1531 1532 arg.session = session; 1533 arg.ino = ceph_vino(inode).ino; 1534 arg.cid = 0; 1535 arg.follows = capsnap->follows; 1536 arg.flush_tid = capsnap->cap_flush.tid; 1537 arg.oldest_flush_tid = oldest_flush_tid; 1538 1539 arg.size = capsnap->size; 1540 arg.max_size = 0; 1541 arg.xattr_version = capsnap->xattr_version; 1542 arg.xattr_buf = capsnap->xattr_blob; 1543 arg.old_xattr_buf = NULL; 1544 1545 arg.atime = capsnap->atime; 1546 arg.mtime = capsnap->mtime; 1547 arg.ctime = capsnap->ctime; 1548 arg.btime = capsnap->btime; 1549 arg.change_attr = capsnap->change_attr; 1550 1551 arg.op = CEPH_CAP_OP_FLUSHSNAP; 1552 arg.caps = capsnap->issued; 1553 arg.wanted = 0; 1554 arg.dirty = capsnap->dirty; 1555 1556 arg.seq = 0; 1557 arg.issue_seq = 0; 1558 arg.mseq = mseq; 1559 arg.time_warp_seq = capsnap->time_warp_seq; 1560 1561 arg.uid = capsnap->uid; 1562 arg.gid = capsnap->gid; 1563 arg.mode = capsnap->mode; 1564 1565 arg.inline_data = capsnap->inline_data; 1566 arg.flags = 0; 1567 arg.wake = false; 1568 arg.encrypted = IS_ENCRYPTED(inode); 1569 1570 /* No fscrypt_auth changes from a capsnap.*/ 1571 arg.fscrypt_auth_len = 0; 1572 1573 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), 1574 GFP_NOFS, false); 1575 if (!msg) 1576 return -ENOMEM; 1577 1578 encode_cap_msg(msg, &arg); 1579 ceph_con_send(&arg.session->s_con, msg); 1580 return 0; 1581 } 1582 1583 /* 1584 * When a snapshot is taken, clients accumulate dirty metadata on 1585 * inodes with capabilities in ceph_cap_snaps to describe the file 1586 * state at the time the snapshot was taken. This must be flushed 1587 * asynchronously back to the MDS once sync writes complete and dirty 1588 * data is written out. 1589 * 1590 * Called under i_ceph_lock. 1591 */ 1592 static void __ceph_flush_snaps(struct ceph_inode_info *ci, 1593 struct ceph_mds_session *session) 1594 __releases(ci->i_ceph_lock) 1595 __acquires(ci->i_ceph_lock) 1596 { 1597 struct inode *inode = &ci->netfs.inode; 1598 struct ceph_mds_client *mdsc = session->s_mdsc; 1599 struct ceph_cap_snap *capsnap; 1600 u64 oldest_flush_tid = 0; 1601 u64 first_tid = 1, last_tid = 0; 1602 1603 dout("__flush_snaps %p session %p\n", inode, session); 1604 1605 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 1606 /* 1607 * we need to wait for sync writes to complete and for dirty 1608 * pages to be written out. 
1609 */ 1610 if (capsnap->dirty_pages || capsnap->writing) 1611 break; 1612 1613 /* should be removed by ceph_try_drop_cap_snap() */ 1614 BUG_ON(!capsnap->need_flush); 1615 1616 /* only flush each capsnap once */ 1617 if (capsnap->cap_flush.tid > 0) { 1618 dout(" already flushed %p, skipping\n", capsnap); 1619 continue; 1620 } 1621 1622 spin_lock(&mdsc->cap_dirty_lock); 1623 capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; 1624 list_add_tail(&capsnap->cap_flush.g_list, 1625 &mdsc->cap_flush_list); 1626 if (oldest_flush_tid == 0) 1627 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 1628 if (list_empty(&ci->i_flushing_item)) { 1629 list_add_tail(&ci->i_flushing_item, 1630 &session->s_cap_flushing); 1631 } 1632 spin_unlock(&mdsc->cap_dirty_lock); 1633 1634 list_add_tail(&capsnap->cap_flush.i_list, 1635 &ci->i_cap_flush_list); 1636 1637 if (first_tid == 1) 1638 first_tid = capsnap->cap_flush.tid; 1639 last_tid = capsnap->cap_flush.tid; 1640 } 1641 1642 ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; 1643 1644 while (first_tid <= last_tid) { 1645 struct ceph_cap *cap = ci->i_auth_cap; 1646 struct ceph_cap_flush *cf = NULL, *iter; 1647 int ret; 1648 1649 if (!(cap && cap->session == session)) { 1650 dout("__flush_snaps %p auth cap %p not mds%d, " 1651 "stop\n", inode, cap, session->s_mds); 1652 break; 1653 } 1654 1655 ret = -ENOENT; 1656 list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { 1657 if (iter->tid >= first_tid) { 1658 cf = iter; 1659 ret = 0; 1660 break; 1661 } 1662 } 1663 if (ret < 0) 1664 break; 1665 1666 first_tid = cf->tid + 1; 1667 1668 capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); 1669 refcount_inc(&capsnap->nref); 1670 spin_unlock(&ci->i_ceph_lock); 1671 1672 dout("__flush_snaps %p capsnap %p tid %llu %s\n", 1673 inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); 1674 1675 ret = __send_flush_snap(inode, session, capsnap, cap->mseq, 1676 oldest_flush_tid); 1677 if (ret < 0) { 1678 pr_err("__flush_snaps: error sending cap flushsnap, " 1679 "ino (%llx.%llx) tid %llu follows %llu\n", 1680 ceph_vinop(inode), cf->tid, capsnap->follows); 1681 } 1682 1683 ceph_put_cap_snap(capsnap); 1684 spin_lock(&ci->i_ceph_lock); 1685 } 1686 } 1687 1688 void ceph_flush_snaps(struct ceph_inode_info *ci, 1689 struct ceph_mds_session **psession) 1690 { 1691 struct inode *inode = &ci->netfs.inode; 1692 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; 1693 struct ceph_mds_session *session = NULL; 1694 bool need_put = false; 1695 int mds; 1696 1697 dout("ceph_flush_snaps %p\n", inode); 1698 if (psession) 1699 session = *psession; 1700 retry: 1701 spin_lock(&ci->i_ceph_lock); 1702 if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { 1703 dout(" no capsnap needs flush, doing nothing\n"); 1704 goto out; 1705 } 1706 if (!ci->i_auth_cap) { 1707 dout(" no auth cap (migrating?), doing nothing\n"); 1708 goto out; 1709 } 1710 1711 mds = ci->i_auth_cap->session->s_mds; 1712 if (session && session->s_mds != mds) { 1713 dout(" oops, wrong session %p mutex\n", session); 1714 ceph_put_mds_session(session); 1715 session = NULL; 1716 } 1717 if (!session) { 1718 spin_unlock(&ci->i_ceph_lock); 1719 mutex_lock(&mdsc->mutex); 1720 session = __ceph_lookup_mds_session(mdsc, mds); 1721 mutex_unlock(&mdsc->mutex); 1722 goto retry; 1723 } 1724 1725 // make sure flushsnap messages are sent in proper order. 
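	// (if earlier cap flushes still need to be re-sent, e.g. after a
	// session reset, kick them out ahead of the snap flushes)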
1726 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) 1727 __kick_flushing_caps(mdsc, session, ci, 0); 1728 1729 __ceph_flush_snaps(ci, session); 1730 out: 1731 spin_unlock(&ci->i_ceph_lock); 1732 1733 if (psession) 1734 *psession = session; 1735 else 1736 ceph_put_mds_session(session); 1737 /* we flushed them all; remove this inode from the queue */ 1738 spin_lock(&mdsc->snap_flush_lock); 1739 if (!list_empty(&ci->i_snap_flush_item)) 1740 need_put = true; 1741 list_del_init(&ci->i_snap_flush_item); 1742 spin_unlock(&mdsc->snap_flush_lock); 1743 1744 if (need_put) 1745 iput(inode); 1746 } 1747 1748 /* 1749 * Mark caps dirty. If inode is newly dirty, return the dirty flags. 1750 * Caller is then responsible for calling __mark_inode_dirty with the 1751 * returned flags value. 1752 */ 1753 int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, 1754 struct ceph_cap_flush **pcf) 1755 { 1756 struct ceph_mds_client *mdsc = 1757 ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; 1758 struct inode *inode = &ci->netfs.inode; 1759 int was = ci->i_dirty_caps; 1760 int dirty = 0; 1761 1762 lockdep_assert_held(&ci->i_ceph_lock); 1763 1764 if (!ci->i_auth_cap) { 1765 pr_warn("__mark_dirty_caps %p %llx mask %s, " 1766 "but no auth cap (session was closed?)\n", 1767 inode, ceph_ino(inode), ceph_cap_string(mask)); 1768 return 0; 1769 } 1770 1771 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, 1772 ceph_cap_string(mask), ceph_cap_string(was), 1773 ceph_cap_string(was | mask)); 1774 ci->i_dirty_caps |= mask; 1775 if (was == 0) { 1776 struct ceph_mds_session *session = ci->i_auth_cap->session; 1777 1778 WARN_ON_ONCE(ci->i_prealloc_cap_flush); 1779 swap(ci->i_prealloc_cap_flush, *pcf); 1780 1781 if (!ci->i_head_snapc) { 1782 WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); 1783 ci->i_head_snapc = ceph_get_snap_context( 1784 ci->i_snap_realm->cached_context); 1785 } 1786 dout(" inode %p now dirty snapc %p auth cap %p\n", 1787 &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap); 1788 BUG_ON(!list_empty(&ci->i_dirty_item)); 1789 spin_lock(&mdsc->cap_dirty_lock); 1790 list_add(&ci->i_dirty_item, &session->s_cap_dirty); 1791 spin_unlock(&mdsc->cap_dirty_lock); 1792 if (ci->i_flushing_caps == 0) { 1793 ihold(inode); 1794 dirty |= I_DIRTY_SYNC; 1795 } 1796 } else { 1797 WARN_ON_ONCE(!ci->i_prealloc_cap_flush); 1798 } 1799 BUG_ON(list_empty(&ci->i_dirty_item)); 1800 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && 1801 (mask & CEPH_CAP_FILE_BUFFER)) 1802 dirty |= I_DIRTY_DATASYNC; 1803 __cap_delay_requeue(mdsc, ci); 1804 return dirty; 1805 } 1806 1807 struct ceph_cap_flush *ceph_alloc_cap_flush(void) 1808 { 1809 struct ceph_cap_flush *cf; 1810 1811 cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); 1812 if (!cf) 1813 return NULL; 1814 1815 cf->is_capsnap = false; 1816 return cf; 1817 } 1818 1819 void ceph_free_cap_flush(struct ceph_cap_flush *cf) 1820 { 1821 if (cf) 1822 kmem_cache_free(ceph_cap_flush_cachep, cf); 1823 } 1824 1825 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) 1826 { 1827 if (!list_empty(&mdsc->cap_flush_list)) { 1828 struct ceph_cap_flush *cf = 1829 list_first_entry(&mdsc->cap_flush_list, 1830 struct ceph_cap_flush, g_list); 1831 return cf->tid; 1832 } 1833 return 0; 1834 } 1835 1836 /* 1837 * Remove cap_flush from the mdsc's or inode's flushing cap list. 1838 * Return true if caller needs to wake up flush waiters. 
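 *
 * If an entry that is not at the head of the list carries the wake
 * flag, the flag is handed to the previous entry instead, so the
 * waiter is only woken once everything queued before it has also
 * been flushed.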
1839 */ 1840 static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, 1841 struct ceph_cap_flush *cf) 1842 { 1843 struct ceph_cap_flush *prev; 1844 bool wake = cf->wake; 1845 1846 if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { 1847 prev = list_prev_entry(cf, g_list); 1848 prev->wake = true; 1849 wake = false; 1850 } 1851 list_del_init(&cf->g_list); 1852 return wake; 1853 } 1854 1855 static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, 1856 struct ceph_cap_flush *cf) 1857 { 1858 struct ceph_cap_flush *prev; 1859 bool wake = cf->wake; 1860 1861 if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { 1862 prev = list_prev_entry(cf, i_list); 1863 prev->wake = true; 1864 wake = false; 1865 } 1866 list_del_init(&cf->i_list); 1867 return wake; 1868 } 1869 1870 /* 1871 * Add dirty inode to the flushing list. Assigned a seq number so we 1872 * can wait for caps to flush without starving. 1873 * 1874 * Called under i_ceph_lock. Returns the flush tid. 1875 */ 1876 static u64 __mark_caps_flushing(struct inode *inode, 1877 struct ceph_mds_session *session, bool wake, 1878 u64 *oldest_flush_tid) 1879 { 1880 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 1881 struct ceph_inode_info *ci = ceph_inode(inode); 1882 struct ceph_cap_flush *cf = NULL; 1883 int flushing; 1884 1885 lockdep_assert_held(&ci->i_ceph_lock); 1886 BUG_ON(ci->i_dirty_caps == 0); 1887 BUG_ON(list_empty(&ci->i_dirty_item)); 1888 BUG_ON(!ci->i_prealloc_cap_flush); 1889 1890 flushing = ci->i_dirty_caps; 1891 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", 1892 ceph_cap_string(flushing), 1893 ceph_cap_string(ci->i_flushing_caps), 1894 ceph_cap_string(ci->i_flushing_caps | flushing)); 1895 ci->i_flushing_caps |= flushing; 1896 ci->i_dirty_caps = 0; 1897 dout(" inode %p now !dirty\n", inode); 1898 1899 swap(cf, ci->i_prealloc_cap_flush); 1900 cf->caps = flushing; 1901 cf->wake = wake; 1902 1903 spin_lock(&mdsc->cap_dirty_lock); 1904 list_del_init(&ci->i_dirty_item); 1905 1906 cf->tid = ++mdsc->last_cap_flush_tid; 1907 list_add_tail(&cf->g_list, &mdsc->cap_flush_list); 1908 *oldest_flush_tid = __get_oldest_flush_tid(mdsc); 1909 1910 if (list_empty(&ci->i_flushing_item)) { 1911 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1912 mdsc->num_cap_flushing++; 1913 } 1914 spin_unlock(&mdsc->cap_dirty_lock); 1915 1916 list_add_tail(&cf->i_list, &ci->i_cap_flush_list); 1917 1918 return cf->tid; 1919 } 1920 1921 /* 1922 * try to invalidate mapping pages without blocking. 1923 */ 1924 static int try_nonblocking_invalidate(struct inode *inode) 1925 __releases(ci->i_ceph_lock) 1926 __acquires(ci->i_ceph_lock) 1927 { 1928 struct ceph_inode_info *ci = ceph_inode(inode); 1929 u32 invalidating_gen = ci->i_rdcache_gen; 1930 1931 spin_unlock(&ci->i_ceph_lock); 1932 ceph_fscache_invalidate(inode, false); 1933 invalidate_mapping_pages(&inode->i_data, 0, -1); 1934 spin_lock(&ci->i_ceph_lock); 1935 1936 if (inode->i_data.nrpages == 0 && 1937 invalidating_gen == ci->i_rdcache_gen) { 1938 /* success. 
*/ 1939 dout("try_nonblocking_invalidate %p success\n", inode); 1940 /* save any racing async invalidate some trouble */ 1941 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; 1942 return 0; 1943 } 1944 dout("try_nonblocking_invalidate %p failed\n", inode); 1945 return -1; 1946 } 1947 1948 bool __ceph_should_report_size(struct ceph_inode_info *ci) 1949 { 1950 loff_t size = i_size_read(&ci->netfs.inode); 1951 /* mds will adjust max size according to the reported size */ 1952 if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) 1953 return false; 1954 if (size >= ci->i_max_size) 1955 return true; 1956 /* half of previous max_size increment has been used */ 1957 if (ci->i_max_size > ci->i_reported_size && 1958 (size << 1) >= ci->i_max_size + ci->i_reported_size) 1959 return true; 1960 return false; 1961 } 1962 1963 /* 1964 * Swiss army knife function to examine currently used and wanted 1965 * versus held caps. Release, flush, ack revoked caps to mds as 1966 * appropriate. 1967 * 1968 * CHECK_CAPS_AUTHONLY - we should only check the auth cap 1969 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without 1970 * further delay. 1971 */ 1972 void ceph_check_caps(struct ceph_inode_info *ci, int flags) 1973 { 1974 struct inode *inode = &ci->netfs.inode; 1975 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 1976 struct ceph_cap *cap; 1977 u64 flush_tid, oldest_flush_tid; 1978 int file_wanted, used, cap_used; 1979 int issued, implemented, want, retain, revoking, flushing = 0; 1980 int mds = -1; /* keep track of how far we've gone through i_caps list 1981 to avoid an infinite loop on retry */ 1982 struct rb_node *p; 1983 bool queue_invalidate = false; 1984 bool tried_invalidate = false; 1985 bool queue_writeback = false; 1986 struct ceph_mds_session *session = NULL; 1987 1988 spin_lock(&ci->i_ceph_lock); 1989 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 1990 ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; 1991 1992 /* Don't send messages until we get async create reply */ 1993 spin_unlock(&ci->i_ceph_lock); 1994 return; 1995 } 1996 1997 if (ci->i_ceph_flags & CEPH_I_FLUSH) 1998 flags |= CHECK_CAPS_FLUSH; 1999 retry: 2000 /* Caps wanted by virtue of active open files. */ 2001 file_wanted = __ceph_caps_file_wanted(ci); 2002 2003 /* Caps which have active references against them */ 2004 used = __ceph_caps_used(ci); 2005 2006 /* 2007 * "issued" represents the current caps that the MDS wants us to have. 2008 * "implemented" is the set that we have been granted, and includes the 2009 * ones that have not yet been returned to the MDS (the "revoking" set, 2010 * usually because they have outstanding references). 2011 */ 2012 issued = __ceph_caps_issued(ci, &implemented); 2013 revoking = implemented & ~issued; 2014 2015 want = file_wanted; 2016 2017 /* The ones we currently want to retain (may be adjusted below) */ 2018 retain = file_wanted | used | CEPH_CAP_PIN; 2019 if (!mdsc->stopping && inode->i_nlink > 0) { 2020 if (file_wanted) { 2021 retain |= CEPH_CAP_ANY; /* be greedy */ 2022 } else if (S_ISDIR(inode->i_mode) && 2023 (issued & CEPH_CAP_FILE_SHARED) && 2024 __ceph_dir_is_complete(ci)) { 2025 /* 2026 * If a directory is complete, we want to keep 2027 * the exclusive cap. So that MDS does not end up 2028 * revoking the shared cap on every create/unlink 2029 * operation. 
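 *
 * Concretely (only restating the code just below, for illustration): a
 * read-only mount asks to keep just CEPH_CAP_ANY_SHARED for such a
 * directory, while a read-write mount asks for
 * CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL and retains those bits.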
2030 */ 2031 if (IS_RDONLY(inode)) { 2032 want = CEPH_CAP_ANY_SHARED; 2033 } else { 2034 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; 2035 } 2036 retain |= want; 2037 } else { 2038 2039 retain |= CEPH_CAP_ANY_SHARED; 2040 /* 2041 * keep RD only if we didn't have the file open RW, 2042 * because then the mds would revoke it anyway to 2043 * journal max_size=0. 2044 */ 2045 if (ci->i_max_size == 0) 2046 retain |= CEPH_CAP_ANY_RD; 2047 } 2048 } 2049 2050 dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" 2051 " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), 2052 ceph_cap_string(file_wanted), 2053 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), 2054 ceph_cap_string(ci->i_flushing_caps), 2055 ceph_cap_string(issued), ceph_cap_string(revoking), 2056 ceph_cap_string(retain), 2057 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", 2058 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", 2059 (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); 2060 2061 /* 2062 * If we no longer need to hold onto old our caps, and we may 2063 * have cached pages, but don't want them, then try to invalidate. 2064 * If we fail, it's because pages are locked.... try again later. 2065 */ 2066 if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && 2067 S_ISREG(inode->i_mode) && 2068 !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ 2069 inode->i_data.nrpages && /* have cached pages */ 2070 (revoking & (CEPH_CAP_FILE_CACHE| 2071 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ 2072 !tried_invalidate) { 2073 dout("check_caps trying to invalidate on %llx.%llx\n", 2074 ceph_vinop(inode)); 2075 if (try_nonblocking_invalidate(inode) < 0) { 2076 dout("check_caps queuing invalidate\n"); 2077 queue_invalidate = true; 2078 ci->i_rdcache_revoking = ci->i_rdcache_gen; 2079 } 2080 tried_invalidate = true; 2081 goto retry; 2082 } 2083 2084 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2085 int mflags = 0; 2086 struct cap_msg_args arg; 2087 2088 cap = rb_entry(p, struct ceph_cap, ci_node); 2089 2090 /* avoid looping forever */ 2091 if (mds >= cap->mds || 2092 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) 2093 continue; 2094 2095 /* 2096 * If we have an auth cap, we don't need to consider any 2097 * overlapping caps as used. 2098 */ 2099 cap_used = used; 2100 if (ci->i_auth_cap && cap != ci->i_auth_cap) 2101 cap_used &= ~ci->i_auth_cap->issued; 2102 2103 revoking = cap->implemented & ~cap->issued; 2104 dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", 2105 cap->mds, cap, ceph_cap_string(cap_used), 2106 ceph_cap_string(cap->issued), 2107 ceph_cap_string(cap->implemented), 2108 ceph_cap_string(revoking)); 2109 2110 if (cap == ci->i_auth_cap && 2111 (cap->issued & CEPH_CAP_FILE_WR)) { 2112 /* request larger max_size from MDS? */ 2113 if (ci->i_wanted_max_size > ci->i_max_size && 2114 ci->i_wanted_max_size > ci->i_requested_max_size) { 2115 dout("requesting new max_size\n"); 2116 goto ack; 2117 } 2118 2119 /* approaching file_max? */ 2120 if (__ceph_should_report_size(ci)) { 2121 dout("i_size approaching max_size\n"); 2122 goto ack; 2123 } 2124 } 2125 /* flush anything dirty? */ 2126 if (cap == ci->i_auth_cap) { 2127 if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { 2128 dout("flushing dirty caps\n"); 2129 goto ack; 2130 } 2131 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { 2132 dout("flushing snap caps\n"); 2133 goto ack; 2134 } 2135 } 2136 2137 /* completed revocation? going down and there are no caps? 
*/ 2138 if (revoking) { 2139 if ((revoking & cap_used) == 0) { 2140 dout("completed revocation of %s\n", 2141 ceph_cap_string(cap->implemented & ~cap->issued)); 2142 goto ack; 2143 } 2144 2145 /* 2146 * If the "i_wrbuffer_ref" was increased by mmap or generic 2147 * cache write just before the ceph_check_caps() is called, 2148 * the Fb capability revoking will fail this time. Then we 2149 * must wait for the BDI's delayed work to flush the dirty 2150 * pages and to release the "i_wrbuffer_ref", which will cost 2151 * at most 5 seconds. That means the MDS needs to wait at 2152 * most 5 seconds to finished the Fb capability's revocation. 2153 * 2154 * Let's queue a writeback for it. 2155 */ 2156 if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && 2157 (revoking & CEPH_CAP_FILE_BUFFER)) 2158 queue_writeback = true; 2159 } 2160 2161 /* want more caps from mds? */ 2162 if (want & ~cap->mds_wanted) { 2163 if (want & ~(cap->mds_wanted | cap->issued)) 2164 goto ack; 2165 if (!__cap_is_valid(cap)) 2166 goto ack; 2167 } 2168 2169 /* things we might delay */ 2170 if ((cap->issued & ~retain) == 0) 2171 continue; /* nope, all good */ 2172 2173 ack: 2174 ceph_put_mds_session(session); 2175 session = ceph_get_mds_session(cap->session); 2176 2177 /* kick flushing and flush snaps before sending normal 2178 * cap message */ 2179 if (cap == ci->i_auth_cap && 2180 (ci->i_ceph_flags & 2181 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { 2182 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) 2183 __kick_flushing_caps(mdsc, session, ci, 0); 2184 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) 2185 __ceph_flush_snaps(ci, session); 2186 2187 goto retry; 2188 } 2189 2190 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 2191 flushing = ci->i_dirty_caps; 2192 flush_tid = __mark_caps_flushing(inode, session, false, 2193 &oldest_flush_tid); 2194 if (flags & CHECK_CAPS_FLUSH && 2195 list_empty(&session->s_cap_dirty)) 2196 mflags |= CEPH_CLIENT_CAPS_SYNC; 2197 } else { 2198 flushing = 0; 2199 flush_tid = 0; 2200 spin_lock(&mdsc->cap_dirty_lock); 2201 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 2202 spin_unlock(&mdsc->cap_dirty_lock); 2203 } 2204 2205 mds = cap->mds; /* remember mds, so we don't repeat */ 2206 2207 __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, 2208 want, retain, flushing, flush_tid, oldest_flush_tid); 2209 2210 spin_unlock(&ci->i_ceph_lock); 2211 __send_cap(&arg, ci); 2212 spin_lock(&ci->i_ceph_lock); 2213 2214 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 2215 } 2216 2217 /* periodically re-calculate caps wanted by open files */ 2218 if (__ceph_is_any_real_caps(ci) && 2219 list_empty(&ci->i_cap_delay_list) && 2220 (file_wanted & ~CEPH_CAP_PIN) && 2221 !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { 2222 __cap_delay_requeue(mdsc, ci); 2223 } 2224 2225 spin_unlock(&ci->i_ceph_lock); 2226 2227 ceph_put_mds_session(session); 2228 if (queue_writeback) 2229 ceph_queue_writeback(inode); 2230 if (queue_invalidate) 2231 ceph_queue_invalidate(inode); 2232 } 2233 2234 /* 2235 * Try to flush dirty caps back to the auth mds. 
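 *
 * Hedged usage sketch (this mirrors ceph_fsync() and ceph_write_inode()
 * below; shown only to illustrate the contract of try_flush_caps()):
 *
 *	u64 flush_tid;
 *	int dirty = try_flush_caps(inode, &flush_tid);
 *
 *	if (dirty)
 *		wait_event_interruptible(ci->i_cap_wq,
 *					 caps_are_flushed(inode, flush_tid));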
2236 */ 2237 static int try_flush_caps(struct inode *inode, u64 *ptid) 2238 { 2239 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 2240 struct ceph_inode_info *ci = ceph_inode(inode); 2241 int flushing = 0; 2242 u64 flush_tid = 0, oldest_flush_tid = 0; 2243 2244 spin_lock(&ci->i_ceph_lock); 2245 retry_locked: 2246 if (ci->i_dirty_caps && ci->i_auth_cap) { 2247 struct ceph_cap *cap = ci->i_auth_cap; 2248 struct cap_msg_args arg; 2249 struct ceph_mds_session *session = cap->session; 2250 2251 if (session->s_state < CEPH_MDS_SESSION_OPEN) { 2252 spin_unlock(&ci->i_ceph_lock); 2253 goto out; 2254 } 2255 2256 if (ci->i_ceph_flags & 2257 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { 2258 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) 2259 __kick_flushing_caps(mdsc, session, ci, 0); 2260 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) 2261 __ceph_flush_snaps(ci, session); 2262 goto retry_locked; 2263 } 2264 2265 flushing = ci->i_dirty_caps; 2266 flush_tid = __mark_caps_flushing(inode, session, true, 2267 &oldest_flush_tid); 2268 2269 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, 2270 __ceph_caps_used(ci), __ceph_caps_wanted(ci), 2271 (cap->issued | cap->implemented), 2272 flushing, flush_tid, oldest_flush_tid); 2273 spin_unlock(&ci->i_ceph_lock); 2274 2275 __send_cap(&arg, ci); 2276 } else { 2277 if (!list_empty(&ci->i_cap_flush_list)) { 2278 struct ceph_cap_flush *cf = 2279 list_last_entry(&ci->i_cap_flush_list, 2280 struct ceph_cap_flush, i_list); 2281 cf->wake = true; 2282 flush_tid = cf->tid; 2283 } 2284 flushing = ci->i_flushing_caps; 2285 spin_unlock(&ci->i_ceph_lock); 2286 } 2287 out: 2288 *ptid = flush_tid; 2289 return flushing; 2290 } 2291 2292 /* 2293 * Return true if we've flushed caps through the given flush_tid. 2294 */ 2295 static int caps_are_flushed(struct inode *inode, u64 flush_tid) 2296 { 2297 struct ceph_inode_info *ci = ceph_inode(inode); 2298 int ret = 1; 2299 2300 spin_lock(&ci->i_ceph_lock); 2301 if (!list_empty(&ci->i_cap_flush_list)) { 2302 struct ceph_cap_flush * cf = 2303 list_first_entry(&ci->i_cap_flush_list, 2304 struct ceph_cap_flush, i_list); 2305 if (cf->tid <= flush_tid) 2306 ret = 0; 2307 } 2308 spin_unlock(&ci->i_ceph_lock); 2309 return ret; 2310 } 2311 2312 /* 2313 * flush the mdlog and wait for any unsafe requests to complete. 2314 */ 2315 static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) 2316 { 2317 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 2318 struct ceph_inode_info *ci = ceph_inode(inode); 2319 struct ceph_mds_request *req1 = NULL, *req2 = NULL; 2320 int ret, err = 0; 2321 2322 spin_lock(&ci->i_unsafe_lock); 2323 if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { 2324 req1 = list_last_entry(&ci->i_unsafe_dirops, 2325 struct ceph_mds_request, 2326 r_unsafe_dir_item); 2327 ceph_mdsc_get_request(req1); 2328 } 2329 if (!list_empty(&ci->i_unsafe_iops)) { 2330 req2 = list_last_entry(&ci->i_unsafe_iops, 2331 struct ceph_mds_request, 2332 r_unsafe_target_item); 2333 ceph_mdsc_get_request(req2); 2334 } 2335 spin_unlock(&ci->i_unsafe_lock); 2336 2337 /* 2338 * Trigger to flush the journal logs in all the relevant MDSes 2339 * manually, or in the worst case we must wait at most 5 seconds 2340 * to wait the journal logs to be flushed by the MDSes periodically. 
2341 */ 2342 if (req1 || req2) { 2343 struct ceph_mds_request *req; 2344 struct ceph_mds_session **sessions; 2345 struct ceph_mds_session *s; 2346 unsigned int max_sessions; 2347 int i; 2348 2349 mutex_lock(&mdsc->mutex); 2350 max_sessions = mdsc->max_sessions; 2351 2352 sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); 2353 if (!sessions) { 2354 mutex_unlock(&mdsc->mutex); 2355 err = -ENOMEM; 2356 goto out; 2357 } 2358 2359 spin_lock(&ci->i_unsafe_lock); 2360 if (req1) { 2361 list_for_each_entry(req, &ci->i_unsafe_dirops, 2362 r_unsafe_dir_item) { 2363 s = req->r_session; 2364 if (!s) 2365 continue; 2366 if (!sessions[s->s_mds]) { 2367 s = ceph_get_mds_session(s); 2368 sessions[s->s_mds] = s; 2369 } 2370 } 2371 } 2372 if (req2) { 2373 list_for_each_entry(req, &ci->i_unsafe_iops, 2374 r_unsafe_target_item) { 2375 s = req->r_session; 2376 if (!s) 2377 continue; 2378 if (!sessions[s->s_mds]) { 2379 s = ceph_get_mds_session(s); 2380 sessions[s->s_mds] = s; 2381 } 2382 } 2383 } 2384 spin_unlock(&ci->i_unsafe_lock); 2385 2386 /* the auth MDS */ 2387 spin_lock(&ci->i_ceph_lock); 2388 if (ci->i_auth_cap) { 2389 s = ci->i_auth_cap->session; 2390 if (!sessions[s->s_mds]) 2391 sessions[s->s_mds] = ceph_get_mds_session(s); 2392 } 2393 spin_unlock(&ci->i_ceph_lock); 2394 mutex_unlock(&mdsc->mutex); 2395 2396 /* send flush mdlog request to MDSes */ 2397 for (i = 0; i < max_sessions; i++) { 2398 s = sessions[i]; 2399 if (s) { 2400 send_flush_mdlog(s); 2401 ceph_put_mds_session(s); 2402 } 2403 } 2404 kfree(sessions); 2405 } 2406 2407 dout("%s %p wait on tid %llu %llu\n", __func__, 2408 inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); 2409 if (req1) { 2410 ret = !wait_for_completion_timeout(&req1->r_safe_completion, 2411 ceph_timeout_jiffies(req1->r_timeout)); 2412 if (ret) 2413 err = -EIO; 2414 } 2415 if (req2) { 2416 ret = !wait_for_completion_timeout(&req2->r_safe_completion, 2417 ceph_timeout_jiffies(req2->r_timeout)); 2418 if (ret) 2419 err = -EIO; 2420 } 2421 2422 out: 2423 if (req1) 2424 ceph_mdsc_put_request(req1); 2425 if (req2) 2426 ceph_mdsc_put_request(req2); 2427 return err; 2428 } 2429 2430 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 2431 { 2432 struct inode *inode = file->f_mapping->host; 2433 struct ceph_inode_info *ci = ceph_inode(inode); 2434 u64 flush_tid; 2435 int ret, err; 2436 int dirty; 2437 2438 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2439 2440 ret = file_write_and_wait_range(file, start, end); 2441 if (datasync) 2442 goto out; 2443 2444 ret = ceph_wait_on_async_create(inode); 2445 if (ret) 2446 goto out; 2447 2448 dirty = try_flush_caps(inode, &flush_tid); 2449 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 2450 2451 err = flush_mdlog_and_wait_inode_unsafe_requests(inode); 2452 2453 /* 2454 * only wait on non-file metadata writeback (the mds 2455 * can recover size and mtime, so we don't need to 2456 * wait for that) 2457 */ 2458 if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 2459 err = wait_event_interruptible(ci->i_cap_wq, 2460 caps_are_flushed(inode, flush_tid)); 2461 } 2462 2463 if (err < 0) 2464 ret = err; 2465 2466 err = file_check_and_advance_wb_err(file); 2467 if (err < 0) 2468 ret = err; 2469 out: 2470 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); 2471 return ret; 2472 } 2473 2474 /* 2475 * Flush any dirty caps back to the mds. 
If we aren't asked to wait, 2476 * queue inode for flush but don't do so immediately, because we can 2477 * get by with fewer MDS messages if we wait for data writeback to 2478 * complete first. 2479 */ 2480 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) 2481 { 2482 struct ceph_inode_info *ci = ceph_inode(inode); 2483 u64 flush_tid; 2484 int err = 0; 2485 int dirty; 2486 int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); 2487 2488 dout("write_inode %p wait=%d\n", inode, wait); 2489 ceph_fscache_unpin_writeback(inode, wbc); 2490 if (wait) { 2491 err = ceph_wait_on_async_create(inode); 2492 if (err) 2493 return err; 2494 dirty = try_flush_caps(inode, &flush_tid); 2495 if (dirty) 2496 err = wait_event_interruptible(ci->i_cap_wq, 2497 caps_are_flushed(inode, flush_tid)); 2498 } else { 2499 struct ceph_mds_client *mdsc = 2500 ceph_sb_to_fs_client(inode->i_sb)->mdsc; 2501 2502 spin_lock(&ci->i_ceph_lock); 2503 if (__ceph_caps_dirty(ci)) 2504 __cap_delay_requeue_front(mdsc, ci); 2505 spin_unlock(&ci->i_ceph_lock); 2506 } 2507 return err; 2508 } 2509 2510 static void __kick_flushing_caps(struct ceph_mds_client *mdsc, 2511 struct ceph_mds_session *session, 2512 struct ceph_inode_info *ci, 2513 u64 oldest_flush_tid) 2514 __releases(ci->i_ceph_lock) 2515 __acquires(ci->i_ceph_lock) 2516 { 2517 struct inode *inode = &ci->netfs.inode; 2518 struct ceph_cap *cap; 2519 struct ceph_cap_flush *cf; 2520 int ret; 2521 u64 first_tid = 0; 2522 u64 last_snap_flush = 0; 2523 2524 /* Don't do anything until create reply comes in */ 2525 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) 2526 return; 2527 2528 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; 2529 2530 list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { 2531 if (cf->is_capsnap) { 2532 last_snap_flush = cf->tid; 2533 break; 2534 } 2535 } 2536 2537 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { 2538 if (cf->tid < first_tid) 2539 continue; 2540 2541 cap = ci->i_auth_cap; 2542 if (!(cap && cap->session == session)) { 2543 pr_err("%p auth cap %p not mds%d ???\n", 2544 inode, cap, session->s_mds); 2545 break; 2546 } 2547 2548 first_tid = cf->tid + 1; 2549 2550 if (!cf->is_capsnap) { 2551 struct cap_msg_args arg; 2552 2553 dout("kick_flushing_caps %p cap %p tid %llu %s\n", 2554 inode, cap, cf->tid, ceph_cap_string(cf->caps)); 2555 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, 2556 (cf->tid < last_snap_flush ? 
2557 CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), 2558 __ceph_caps_used(ci), 2559 __ceph_caps_wanted(ci), 2560 (cap->issued | cap->implemented), 2561 cf->caps, cf->tid, oldest_flush_tid); 2562 spin_unlock(&ci->i_ceph_lock); 2563 __send_cap(&arg, ci); 2564 } else { 2565 struct ceph_cap_snap *capsnap = 2566 container_of(cf, struct ceph_cap_snap, 2567 cap_flush); 2568 dout("kick_flushing_caps %p capsnap %p tid %llu %s\n", 2569 inode, capsnap, cf->tid, 2570 ceph_cap_string(capsnap->dirty)); 2571 2572 refcount_inc(&capsnap->nref); 2573 spin_unlock(&ci->i_ceph_lock); 2574 2575 ret = __send_flush_snap(inode, session, capsnap, cap->mseq, 2576 oldest_flush_tid); 2577 if (ret < 0) { 2578 pr_err("kick_flushing_caps: error sending " 2579 "cap flushsnap, ino (%llx.%llx) " 2580 "tid %llu follows %llu\n", 2581 ceph_vinop(inode), cf->tid, 2582 capsnap->follows); 2583 } 2584 2585 ceph_put_cap_snap(capsnap); 2586 } 2587 2588 spin_lock(&ci->i_ceph_lock); 2589 } 2590 } 2591 2592 void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, 2593 struct ceph_mds_session *session) 2594 { 2595 struct ceph_inode_info *ci; 2596 struct ceph_cap *cap; 2597 u64 oldest_flush_tid; 2598 2599 dout("early_kick_flushing_caps mds%d\n", session->s_mds); 2600 2601 spin_lock(&mdsc->cap_dirty_lock); 2602 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 2603 spin_unlock(&mdsc->cap_dirty_lock); 2604 2605 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 2606 spin_lock(&ci->i_ceph_lock); 2607 cap = ci->i_auth_cap; 2608 if (!(cap && cap->session == session)) { 2609 pr_err("%p auth cap %p not mds%d ???\n", 2610 &ci->netfs.inode, cap, session->s_mds); 2611 spin_unlock(&ci->i_ceph_lock); 2612 continue; 2613 } 2614 2615 2616 /* 2617 * if flushing caps were revoked, we re-send the cap flush 2618 * in client reconnect stage. This guarantees MDS * processes 2619 * the cap flush message before issuing the flushing caps to 2620 * other client. 2621 */ 2622 if ((cap->issued & ci->i_flushing_caps) != 2623 ci->i_flushing_caps) { 2624 /* encode_caps_cb() also will reset these sequence 2625 * numbers. 
make sure sequence numbers in cap flush 2626 * message match later reconnect message */ 2627 cap->seq = 0; 2628 cap->issue_seq = 0; 2629 cap->mseq = 0; 2630 __kick_flushing_caps(mdsc, session, ci, 2631 oldest_flush_tid); 2632 } else { 2633 ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; 2634 } 2635 2636 spin_unlock(&ci->i_ceph_lock); 2637 } 2638 } 2639 2640 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 2641 struct ceph_mds_session *session) 2642 { 2643 struct ceph_inode_info *ci; 2644 struct ceph_cap *cap; 2645 u64 oldest_flush_tid; 2646 2647 lockdep_assert_held(&session->s_mutex); 2648 2649 dout("kick_flushing_caps mds%d\n", session->s_mds); 2650 2651 spin_lock(&mdsc->cap_dirty_lock); 2652 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 2653 spin_unlock(&mdsc->cap_dirty_lock); 2654 2655 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 2656 spin_lock(&ci->i_ceph_lock); 2657 cap = ci->i_auth_cap; 2658 if (!(cap && cap->session == session)) { 2659 pr_err("%p auth cap %p not mds%d ???\n", 2660 &ci->netfs.inode, cap, session->s_mds); 2661 spin_unlock(&ci->i_ceph_lock); 2662 continue; 2663 } 2664 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { 2665 __kick_flushing_caps(mdsc, session, ci, 2666 oldest_flush_tid); 2667 } 2668 spin_unlock(&ci->i_ceph_lock); 2669 } 2670 } 2671 2672 void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, 2673 struct ceph_inode_info *ci) 2674 { 2675 struct ceph_mds_client *mdsc = session->s_mdsc; 2676 struct ceph_cap *cap = ci->i_auth_cap; 2677 2678 lockdep_assert_held(&ci->i_ceph_lock); 2679 2680 dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, 2681 ceph_cap_string(ci->i_flushing_caps)); 2682 2683 if (!list_empty(&ci->i_cap_flush_list)) { 2684 u64 oldest_flush_tid; 2685 spin_lock(&mdsc->cap_dirty_lock); 2686 list_move_tail(&ci->i_flushing_item, 2687 &cap->session->s_cap_flushing); 2688 oldest_flush_tid = __get_oldest_flush_tid(mdsc); 2689 spin_unlock(&mdsc->cap_dirty_lock); 2690 2691 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); 2692 } 2693 } 2694 2695 2696 /* 2697 * Take references to capabilities we hold, so that we don't release 2698 * them to the MDS prematurely. 2699 */ 2700 void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, 2701 bool snap_rwsem_locked) 2702 { 2703 lockdep_assert_held(&ci->i_ceph_lock); 2704 2705 if (got & CEPH_CAP_PIN) 2706 ci->i_pin_ref++; 2707 if (got & CEPH_CAP_FILE_RD) 2708 ci->i_rd_ref++; 2709 if (got & CEPH_CAP_FILE_CACHE) 2710 ci->i_rdcache_ref++; 2711 if (got & CEPH_CAP_FILE_EXCL) 2712 ci->i_fx_ref++; 2713 if (got & CEPH_CAP_FILE_WR) { 2714 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { 2715 BUG_ON(!snap_rwsem_locked); 2716 ci->i_head_snapc = ceph_get_snap_context( 2717 ci->i_snap_realm->cached_context); 2718 } 2719 ci->i_wr_ref++; 2720 } 2721 if (got & CEPH_CAP_FILE_BUFFER) { 2722 if (ci->i_wb_ref == 0) 2723 ihold(&ci->netfs.inode); 2724 ci->i_wb_ref++; 2725 dout("%s %p wb %d -> %d (?)\n", __func__, 2726 &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); 2727 } 2728 } 2729 2730 /* 2731 * Try to grab cap references. Specify those refs we @want, and the 2732 * minimal set we @need. Also include the larger offset we are writing 2733 * to (when applicable), and check against max_size here as well. 2734 * Note that caller is responsible for ensuring max_size increases are 2735 * requested from the MDS. 2736 * 2737 * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, 2738 * or a negative error code. 
There are 3 special error codes: 2739 * -EAGAIN: need to sleep but non-blocking is specified 2740 * -EFBIG: ask caller to call check_max_size() and try again. 2741 * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. 2742 */ 2743 enum { 2744 /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ 2745 NON_BLOCKING = (1 << 8), 2746 CHECK_FILELOCK = (1 << 9), 2747 }; 2748 2749 static int try_get_cap_refs(struct inode *inode, int need, int want, 2750 loff_t endoff, int flags, int *got) 2751 { 2752 struct ceph_inode_info *ci = ceph_inode(inode); 2753 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; 2754 int ret = 0; 2755 int have, implemented; 2756 bool snap_rwsem_locked = false; 2757 2758 dout("get_cap_refs %p need %s want %s\n", inode, 2759 ceph_cap_string(need), ceph_cap_string(want)); 2760 2761 again: 2762 spin_lock(&ci->i_ceph_lock); 2763 2764 if ((flags & CHECK_FILELOCK) && 2765 (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { 2766 dout("try_get_cap_refs %p error filelock\n", inode); 2767 ret = -EIO; 2768 goto out_unlock; 2769 } 2770 2771 /* finish pending truncate */ 2772 while (ci->i_truncate_pending) { 2773 spin_unlock(&ci->i_ceph_lock); 2774 if (snap_rwsem_locked) { 2775 up_read(&mdsc->snap_rwsem); 2776 snap_rwsem_locked = false; 2777 } 2778 __ceph_do_pending_vmtruncate(inode); 2779 spin_lock(&ci->i_ceph_lock); 2780 } 2781 2782 have = __ceph_caps_issued(ci, &implemented); 2783 2784 if (have & need & CEPH_CAP_FILE_WR) { 2785 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { 2786 dout("get_cap_refs %p endoff %llu > maxsize %llu\n", 2787 inode, endoff, ci->i_max_size); 2788 if (endoff > ci->i_requested_max_size) 2789 ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; 2790 goto out_unlock; 2791 } 2792 /* 2793 * If a sync write is in progress, we must wait, so that we 2794 * can get a final snapshot value for size+mtime. 2795 */ 2796 if (__ceph_have_pending_cap_snap(ci)) { 2797 dout("get_cap_refs %p cap_snap_pending\n", inode); 2798 goto out_unlock; 2799 } 2800 } 2801 2802 if ((have & need) == need) { 2803 /* 2804 * Look at (implemented & ~have & not) so that we keep waiting 2805 * on transition from wanted -> needed caps. This is needed 2806 * for WRBUFFER|WR -> WR to avoid a new WR sync write from 2807 * going before a prior buffered writeback happens. 2808 * 2809 * For RDCACHE|RD -> RD, there is no need to wait and we can 2810 * just exclude the revoking caps and force a sync read.
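 *
 * Worked example (illustrative only): with need = Fw and want = Fb|Fw
 * while Fb is being revoked, have = Fw and implemented = Fb|Fw, so
 * not = Fb, revoking = Fb, exclude = Fb, and the Fb bit in exclude
 * makes us keep waiting.  With need = Fr and want = Fc|Fr while Fc is
 * being revoked, exclude = Fc carries no Fb bit, so we proceed with
 * *got = Fr and the read is forced to be synchronous.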
2811 */ 2812 int not = want & ~(have & need); 2813 int revoking = implemented & ~have; 2814 int exclude = revoking & not; 2815 dout("get_cap_refs %p have %s but not %s (revoking %s)\n", 2816 inode, ceph_cap_string(have), ceph_cap_string(not), 2817 ceph_cap_string(revoking)); 2818 if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { 2819 if (!snap_rwsem_locked && 2820 !ci->i_head_snapc && 2821 (need & CEPH_CAP_FILE_WR)) { 2822 if (!down_read_trylock(&mdsc->snap_rwsem)) { 2823 /* 2824 * we can not call down_read() when 2825 * task isn't in TASK_RUNNING state 2826 */ 2827 if (flags & NON_BLOCKING) { 2828 ret = -EAGAIN; 2829 goto out_unlock; 2830 } 2831 2832 spin_unlock(&ci->i_ceph_lock); 2833 down_read(&mdsc->snap_rwsem); 2834 snap_rwsem_locked = true; 2835 goto again; 2836 } 2837 snap_rwsem_locked = true; 2838 } 2839 if ((have & want) == want) 2840 *got = need | (want & ~exclude); 2841 else 2842 *got = need; 2843 ceph_take_cap_refs(ci, *got, true); 2844 ret = 1; 2845 } 2846 } else { 2847 int session_readonly = false; 2848 int mds_wanted; 2849 if (ci->i_auth_cap && 2850 (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { 2851 struct ceph_mds_session *s = ci->i_auth_cap->session; 2852 spin_lock(&s->s_cap_lock); 2853 session_readonly = s->s_readonly; 2854 spin_unlock(&s->s_cap_lock); 2855 } 2856 if (session_readonly) { 2857 dout("get_cap_refs %p need %s but mds%d readonly\n", 2858 inode, ceph_cap_string(need), ci->i_auth_cap->mds); 2859 ret = -EROFS; 2860 goto out_unlock; 2861 } 2862 2863 if (ceph_inode_is_shutdown(inode)) { 2864 dout("get_cap_refs %p inode is shutdown\n", inode); 2865 ret = -ESTALE; 2866 goto out_unlock; 2867 } 2868 mds_wanted = __ceph_caps_mds_wanted(ci, false); 2869 if (need & ~mds_wanted) { 2870 dout("get_cap_refs %p need %s > mds_wanted %s\n", 2871 inode, ceph_cap_string(need), 2872 ceph_cap_string(mds_wanted)); 2873 ret = -EUCLEAN; 2874 goto out_unlock; 2875 } 2876 2877 dout("get_cap_refs %p have %s need %s\n", inode, 2878 ceph_cap_string(have), ceph_cap_string(need)); 2879 } 2880 out_unlock: 2881 2882 __ceph_touch_fmode(ci, mdsc, flags); 2883 2884 spin_unlock(&ci->i_ceph_lock); 2885 if (snap_rwsem_locked) 2886 up_read(&mdsc->snap_rwsem); 2887 2888 if (!ret) 2889 ceph_update_cap_mis(&mdsc->metric); 2890 else if (ret == 1) 2891 ceph_update_cap_hit(&mdsc->metric); 2892 2893 dout("get_cap_refs %p ret %d got %s\n", inode, 2894 ret, ceph_cap_string(*got)); 2895 return ret; 2896 } 2897 2898 /* 2899 * Check the offset we are writing up to against our current 2900 * max_size. If necessary, tell the MDS we want to write to 2901 * a larger offset. 2902 */ 2903 static void check_max_size(struct inode *inode, loff_t endoff) 2904 { 2905 struct ceph_inode_info *ci = ceph_inode(inode); 2906 int check = 0; 2907 2908 /* do we need to explicitly request a larger max_size? 
*/ 2909 spin_lock(&ci->i_ceph_lock); 2910 if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { 2911 dout("write %p at large endoff %llu, req max_size\n", 2912 inode, endoff); 2913 ci->i_wanted_max_size = endoff; 2914 } 2915 /* duplicate ceph_check_caps()'s logic */ 2916 if (ci->i_auth_cap && 2917 (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && 2918 ci->i_wanted_max_size > ci->i_max_size && 2919 ci->i_wanted_max_size > ci->i_requested_max_size) 2920 check = 1; 2921 spin_unlock(&ci->i_ceph_lock); 2922 if (check) 2923 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); 2924 } 2925 2926 static inline int get_used_fmode(int caps) 2927 { 2928 int fmode = 0; 2929 if (caps & CEPH_CAP_FILE_RD) 2930 fmode |= CEPH_FILE_MODE_RD; 2931 if (caps & CEPH_CAP_FILE_WR) 2932 fmode |= CEPH_FILE_MODE_WR; 2933 return fmode; 2934 } 2935 2936 int ceph_try_get_caps(struct inode *inode, int need, int want, 2937 bool nonblock, int *got) 2938 { 2939 int ret, flags; 2940 2941 BUG_ON(need & ~CEPH_CAP_FILE_RD); 2942 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | 2943 CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2944 CEPH_CAP_ANY_DIR_OPS)); 2945 if (need) { 2946 ret = ceph_pool_perm_check(inode, need); 2947 if (ret < 0) 2948 return ret; 2949 } 2950 2951 flags = get_used_fmode(need | want); 2952 if (nonblock) 2953 flags |= NON_BLOCKING; 2954 2955 ret = try_get_cap_refs(inode, need, want, 0, flags, got); 2956 /* three special error codes */ 2957 if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) 2958 ret = 0; 2959 return ret; 2960 } 2961 2962 /* 2963 * Wait for caps, and take cap references. If we can't get a WR cap 2964 * due to a small max_size, make sure we check_max_size (and possibly 2965 * ask the mds) so we don't get hung up indefinitely. 2966 */ 2967 int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, 2968 int want, loff_t endoff, int *got) 2969 { 2970 struct ceph_inode_info *ci = ceph_inode(inode); 2971 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 2972 int ret, _got, flags; 2973 2974 ret = ceph_pool_perm_check(inode, need); 2975 if (ret < 0) 2976 return ret; 2977 2978 if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && 2979 fi->filp_gen != READ_ONCE(fsc->filp_gen)) 2980 return -EBADF; 2981 2982 flags = get_used_fmode(need | want); 2983 2984 while (true) { 2985 flags &= CEPH_FILE_MODE_MASK; 2986 if (vfs_inode_has_locks(inode)) 2987 flags |= CHECK_FILELOCK; 2988 _got = 0; 2989 ret = try_get_cap_refs(inode, need, want, endoff, 2990 flags, &_got); 2991 WARN_ON_ONCE(ret == -EAGAIN); 2992 if (!ret) { 2993 struct ceph_mds_client *mdsc = fsc->mdsc; 2994 struct cap_wait cw; 2995 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2996 2997 cw.ino = ceph_ino(inode); 2998 cw.tgid = current->tgid; 2999 cw.need = need; 3000 cw.want = want; 3001 3002 spin_lock(&mdsc->caps_list_lock); 3003 list_add(&cw.list, &mdsc->cap_wait_list); 3004 spin_unlock(&mdsc->caps_list_lock); 3005 3006 /* make sure used fmode not timeout */ 3007 ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); 3008 add_wait_queue(&ci->i_cap_wq, &wait); 3009 3010 flags |= NON_BLOCKING; 3011 while (!(ret = try_get_cap_refs(inode, need, want, 3012 endoff, flags, &_got))) { 3013 if (signal_pending(current)) { 3014 ret = -ERESTARTSYS; 3015 break; 3016 } 3017 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3018 } 3019 3020 remove_wait_queue(&ci->i_cap_wq, &wait); 3021 ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); 3022 3023 spin_lock(&mdsc->caps_list_lock); 3024 list_del(&cw.list); 3025 spin_unlock(&mdsc->caps_list_lock); 
3026 3027 if (ret == -EAGAIN) 3028 continue; 3029 } 3030 3031 if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && 3032 fi->filp_gen != READ_ONCE(fsc->filp_gen)) { 3033 if (ret >= 0 && _got) 3034 ceph_put_cap_refs(ci, _got); 3035 return -EBADF; 3036 } 3037 3038 if (ret < 0) { 3039 if (ret == -EFBIG || ret == -EUCLEAN) { 3040 int ret2 = ceph_wait_on_async_create(inode); 3041 if (ret2 < 0) 3042 return ret2; 3043 } 3044 if (ret == -EFBIG) { 3045 check_max_size(inode, endoff); 3046 continue; 3047 } 3048 if (ret == -EUCLEAN) { 3049 /* session was killed, try renew caps */ 3050 ret = ceph_renew_caps(inode, flags); 3051 if (ret == 0) 3052 continue; 3053 } 3054 return ret; 3055 } 3056 3057 if (S_ISREG(ci->netfs.inode.i_mode) && 3058 ceph_has_inline_data(ci) && 3059 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 3060 i_size_read(inode) > 0) { 3061 struct page *page = 3062 find_get_page(inode->i_mapping, 0); 3063 if (page) { 3064 bool uptodate = PageUptodate(page); 3065 3066 put_page(page); 3067 if (uptodate) 3068 break; 3069 } 3070 /* 3071 * drop cap refs first because getattr while 3072 * holding * caps refs can cause deadlock. 3073 */ 3074 ceph_put_cap_refs(ci, _got); 3075 _got = 0; 3076 3077 /* 3078 * getattr request will bring inline data into 3079 * page cache 3080 */ 3081 ret = __ceph_do_getattr(inode, NULL, 3082 CEPH_STAT_CAP_INLINE_DATA, 3083 true); 3084 if (ret < 0) 3085 return ret; 3086 continue; 3087 } 3088 break; 3089 } 3090 *got = _got; 3091 return 0; 3092 } 3093 3094 int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, 3095 int *got) 3096 { 3097 struct ceph_file_info *fi = filp->private_data; 3098 struct inode *inode = file_inode(filp); 3099 3100 return __ceph_get_caps(inode, fi, need, want, endoff, got); 3101 } 3102 3103 /* 3104 * Take cap refs. Caller must already know we hold at least one ref 3105 * on the caps in question or we don't know this is safe. 3106 */ 3107 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) 3108 { 3109 spin_lock(&ci->i_ceph_lock); 3110 ceph_take_cap_refs(ci, caps, false); 3111 spin_unlock(&ci->i_ceph_lock); 3112 } 3113 3114 3115 /* 3116 * drop cap_snap that is not associated with any snapshot. 3117 * we don't need to send FLUSHSNAP message for it. 3118 */ 3119 static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, 3120 struct ceph_cap_snap *capsnap) 3121 { 3122 if (!capsnap->need_flush && 3123 !capsnap->writing && !capsnap->dirty_pages) { 3124 dout("dropping cap_snap %p follows %llu\n", 3125 capsnap, capsnap->follows); 3126 BUG_ON(capsnap->cap_flush.tid > 0); 3127 ceph_put_snap_context(capsnap->context); 3128 if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) 3129 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; 3130 3131 list_del(&capsnap->ci_item); 3132 ceph_put_cap_snap(capsnap); 3133 return 1; 3134 } 3135 return 0; 3136 } 3137 3138 enum put_cap_refs_mode { 3139 PUT_CAP_REFS_SYNC = 0, 3140 PUT_CAP_REFS_NO_CHECK, 3141 PUT_CAP_REFS_ASYNC, 3142 }; 3143 3144 /* 3145 * Release cap refs. 3146 * 3147 * If we released the last ref on any given cap, call ceph_check_caps 3148 * to release (or schedule a release). 3149 * 3150 * If we are releasing a WR cap (from a sync write), finalize any affected 3151 * cap_snap, and wake up any waiters. 
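 *
 * Hedged pairing sketch with the get side (the function names are the
 * ones defined in this file; the write step itself is only a
 * placeholder):
 *
 *	int got = 0;
 *
 *	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
 *			    endoff, &got);
 *	if (ret < 0)
 *		return ret;
 *	...perform the write...
 *	ceph_put_cap_refs(ci, got);
 *
 * Contexts that must not check caps synchronously use
 * ceph_put_cap_refs_async() (ceph_check_caps() is then deferred via
 * ceph_queue_check_caps()) or ceph_put_cap_refs_no_check_caps().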
3152 */ 3153 static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, 3154 enum put_cap_refs_mode mode) 3155 { 3156 struct inode *inode = &ci->netfs.inode; 3157 int last = 0, put = 0, flushsnaps = 0, wake = 0; 3158 bool check_flushsnaps = false; 3159 3160 spin_lock(&ci->i_ceph_lock); 3161 if (had & CEPH_CAP_PIN) 3162 --ci->i_pin_ref; 3163 if (had & CEPH_CAP_FILE_RD) 3164 if (--ci->i_rd_ref == 0) 3165 last++; 3166 if (had & CEPH_CAP_FILE_CACHE) 3167 if (--ci->i_rdcache_ref == 0) 3168 last++; 3169 if (had & CEPH_CAP_FILE_EXCL) 3170 if (--ci->i_fx_ref == 0) 3171 last++; 3172 if (had & CEPH_CAP_FILE_BUFFER) { 3173 if (--ci->i_wb_ref == 0) { 3174 last++; 3175 /* put the ref held by ceph_take_cap_refs() */ 3176 put++; 3177 check_flushsnaps = true; 3178 } 3179 dout("put_cap_refs %p wb %d -> %d (?)\n", 3180 inode, ci->i_wb_ref+1, ci->i_wb_ref); 3181 } 3182 if (had & CEPH_CAP_FILE_WR) { 3183 if (--ci->i_wr_ref == 0) { 3184 /* 3185 * The Fb caps will always be took and released 3186 * together with the Fw caps. 3187 */ 3188 WARN_ON_ONCE(ci->i_wb_ref); 3189 3190 last++; 3191 check_flushsnaps = true; 3192 if (ci->i_wrbuffer_ref_head == 0 && 3193 ci->i_dirty_caps == 0 && 3194 ci->i_flushing_caps == 0) { 3195 BUG_ON(!ci->i_head_snapc); 3196 ceph_put_snap_context(ci->i_head_snapc); 3197 ci->i_head_snapc = NULL; 3198 } 3199 /* see comment in __ceph_remove_cap() */ 3200 if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) 3201 ceph_change_snap_realm(inode, NULL); 3202 } 3203 } 3204 if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { 3205 struct ceph_cap_snap *capsnap = 3206 list_last_entry(&ci->i_cap_snaps, 3207 struct ceph_cap_snap, 3208 ci_item); 3209 3210 capsnap->writing = 0; 3211 if (ceph_try_drop_cap_snap(ci, capsnap)) 3212 /* put the ref held by ceph_queue_cap_snap() */ 3213 put++; 3214 else if (__ceph_finish_cap_snap(ci, capsnap)) 3215 flushsnaps = 1; 3216 wake = 1; 3217 } 3218 spin_unlock(&ci->i_ceph_lock); 3219 3220 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), 3221 last ? " last" : "", put ? " put" : ""); 3222 3223 switch (mode) { 3224 case PUT_CAP_REFS_SYNC: 3225 if (last) 3226 ceph_check_caps(ci, 0); 3227 else if (flushsnaps) 3228 ceph_flush_snaps(ci, NULL); 3229 break; 3230 case PUT_CAP_REFS_ASYNC: 3231 if (last) 3232 ceph_queue_check_caps(inode); 3233 else if (flushsnaps) 3234 ceph_queue_flush_snaps(inode); 3235 break; 3236 default: 3237 break; 3238 } 3239 if (wake) 3240 wake_up_all(&ci->i_cap_wq); 3241 while (put-- > 0) 3242 iput(inode); 3243 } 3244 3245 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) 3246 { 3247 __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); 3248 } 3249 3250 void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) 3251 { 3252 __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); 3253 } 3254 3255 void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) 3256 { 3257 __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); 3258 } 3259 3260 /* 3261 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap 3262 * context. Adjust per-snap dirty page accounting as appropriate. 3263 * Once all dirty data for a cap_snap is flushed, flush snapped file 3264 * metadata back to the MDS. If we dropped the last ref, call 3265 * ceph_check_caps. 
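 *
 * Hedged caller sketch (the writeback-completion caller is assumed
 * here and lives outside this file): refs taken when dirtying pages
 * under a snap context are dropped against that same context once the
 * pages have been written out:
 *
 *	ceph_put_wrbuffer_cap_refs(ci, nr_pages, snapc);
 *	ceph_put_snap_context(snapc);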
3266 */ 3267 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 3268 struct ceph_snap_context *snapc) 3269 { 3270 struct inode *inode = &ci->netfs.inode; 3271 struct ceph_cap_snap *capsnap = NULL, *iter; 3272 int put = 0; 3273 bool last = false; 3274 bool flush_snaps = false; 3275 bool complete_capsnap = false; 3276 3277 spin_lock(&ci->i_ceph_lock); 3278 ci->i_wrbuffer_ref -= nr; 3279 if (ci->i_wrbuffer_ref == 0) { 3280 last = true; 3281 put++; 3282 } 3283 3284 if (ci->i_head_snapc == snapc) { 3285 ci->i_wrbuffer_ref_head -= nr; 3286 if (ci->i_wrbuffer_ref_head == 0 && 3287 ci->i_wr_ref == 0 && 3288 ci->i_dirty_caps == 0 && 3289 ci->i_flushing_caps == 0) { 3290 BUG_ON(!ci->i_head_snapc); 3291 ceph_put_snap_context(ci->i_head_snapc); 3292 ci->i_head_snapc = NULL; 3293 } 3294 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", 3295 inode, 3296 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, 3297 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 3298 last ? " LAST" : ""); 3299 } else { 3300 list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { 3301 if (iter->context == snapc) { 3302 capsnap = iter; 3303 break; 3304 } 3305 } 3306 3307 if (!capsnap) { 3308 /* 3309 * The capsnap should already be removed when removing 3310 * auth cap in the case of a forced unmount. 3311 */ 3312 WARN_ON_ONCE(ci->i_auth_cap); 3313 goto unlock; 3314 } 3315 3316 capsnap->dirty_pages -= nr; 3317 if (capsnap->dirty_pages == 0) { 3318 complete_capsnap = true; 3319 if (!capsnap->writing) { 3320 if (ceph_try_drop_cap_snap(ci, capsnap)) { 3321 put++; 3322 } else { 3323 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; 3324 flush_snaps = true; 3325 } 3326 } 3327 } 3328 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 3329 " snap %lld %d/%d -> %d/%d %s%s\n", 3330 inode, capsnap, capsnap->context->seq, 3331 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 3332 ci->i_wrbuffer_ref, capsnap->dirty_pages, 3333 last ? " (wrbuffer last)" : "", 3334 complete_capsnap ? " (complete capsnap)" : ""); 3335 } 3336 3337 unlock: 3338 spin_unlock(&ci->i_ceph_lock); 3339 3340 if (last) { 3341 ceph_check_caps(ci, 0); 3342 } else if (flush_snaps) { 3343 ceph_flush_snaps(ci, NULL); 3344 } 3345 if (complete_capsnap) 3346 wake_up_all(&ci->i_cap_wq); 3347 while (put-- > 0) { 3348 iput(inode); 3349 } 3350 } 3351 3352 /* 3353 * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. 3354 */ 3355 static void invalidate_aliases(struct inode *inode) 3356 { 3357 struct dentry *dn, *prev = NULL; 3358 3359 dout("invalidate_aliases inode %p\n", inode); 3360 d_prune_aliases(inode); 3361 /* 3362 * For non-directory inode, d_find_alias() only returns 3363 * hashed dentry. After calling d_invalidate(), the 3364 * dentry becomes unhashed. 3365 * 3366 * For directory inode, d_find_alias() can return 3367 * unhashed dentry. But directory inode should have 3368 * one alias at most. 
3369 */ 3370 while ((dn = d_find_alias(inode))) { 3371 if (dn == prev) { 3372 dput(dn); 3373 break; 3374 } 3375 d_invalidate(dn); 3376 if (prev) 3377 dput(prev); 3378 prev = dn; 3379 } 3380 if (prev) 3381 dput(prev); 3382 } 3383 3384 struct cap_extra_info { 3385 struct ceph_string *pool_ns; 3386 /* inline data */ 3387 u64 inline_version; 3388 void *inline_data; 3389 u32 inline_len; 3390 /* dirstat */ 3391 bool dirstat_valid; 3392 u64 nfiles; 3393 u64 nsubdirs; 3394 u64 change_attr; 3395 /* currently issued */ 3396 int issued; 3397 struct timespec64 btime; 3398 u8 *fscrypt_auth; 3399 u32 fscrypt_auth_len; 3400 u64 fscrypt_file_size; 3401 }; 3402 3403 /* 3404 * Handle a cap GRANT message from the MDS. (Note that a GRANT may 3405 * actually be a revocation if it specifies a smaller cap set.) 3406 * 3407 * caller holds s_mutex and i_ceph_lock, we drop both. 3408 */ 3409 static void handle_cap_grant(struct inode *inode, 3410 struct ceph_mds_session *session, 3411 struct ceph_cap *cap, 3412 struct ceph_mds_caps *grant, 3413 struct ceph_buffer *xattr_buf, 3414 struct cap_extra_info *extra_info) 3415 __releases(ci->i_ceph_lock) 3416 __releases(session->s_mdsc->snap_rwsem) 3417 { 3418 struct ceph_inode_info *ci = ceph_inode(inode); 3419 int seq = le32_to_cpu(grant->seq); 3420 int newcaps = le32_to_cpu(grant->caps); 3421 int used, wanted, dirty; 3422 u64 size = le64_to_cpu(grant->size); 3423 u64 max_size = le64_to_cpu(grant->max_size); 3424 unsigned char check_caps = 0; 3425 bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); 3426 bool wake = false; 3427 bool writeback = false; 3428 bool queue_trunc = false; 3429 bool queue_invalidate = false; 3430 bool deleted_inode = false; 3431 bool fill_inline = false; 3432 3433 /* 3434 * If there is at least one crypto block then we'll trust 3435 * fscrypt_file_size. If the real length of the file is 0, then 3436 * ignore it (it has probably been truncated down to 0 by the MDS). 3437 */ 3438 if (IS_ENCRYPTED(inode) && size) 3439 size = extra_info->fscrypt_file_size; 3440 3441 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 3442 inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); 3443 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 3444 i_size_read(inode)); 3445 3446 3447 /* 3448 * If CACHE is being revoked, and we have no dirty buffers, 3449 * try to invalidate (once). (If there are dirty buffers, we 3450 * will invalidate _after_ writeback.) 3451 */ 3452 if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ 3453 ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 3454 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 3455 !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { 3456 if (try_nonblocking_invalidate(inode)) { 3457 /* there were locked pages.. invalidate later 3458 in a separate thread. */ 3459 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 3460 queue_invalidate = true; 3461 ci->i_rdcache_revoking = ci->i_rdcache_gen; 3462 } 3463 } 3464 } 3465 3466 if (was_stale) 3467 cap->issued = cap->implemented = CEPH_CAP_PIN; 3468 3469 /* 3470 * auth mds of the inode changed. we received the cap export message, 3471 * but still haven't received the cap import message. handle_cap_export 3472 * updated the new auth MDS' cap. 3473 * 3474 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message 3475 * that was sent before the cap import message. So don't remove caps. 
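 *
 * Example (illustrative): if the import left cap->seq at 5 and a stale
 * grant arrives with seq 4, ceph_seq_cmp(4, 5) <= 0 holds, so below we
 * keep cap->seq and OR the already-issued bits into newcaps rather
 * than letting the stale message shrink the cap set.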
3476 */ 3477 if (ceph_seq_cmp(seq, cap->seq) <= 0) { 3478 WARN_ON(cap != ci->i_auth_cap); 3479 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); 3480 seq = cap->seq; 3481 newcaps |= cap->issued; 3482 } 3483 3484 /* side effects now are allowed */ 3485 cap->cap_gen = atomic_read(&session->s_cap_gen); 3486 cap->seq = seq; 3487 3488 __check_cap_issue(ci, cap, newcaps); 3489 3490 inode_set_max_iversion_raw(inode, extra_info->change_attr); 3491 3492 if ((newcaps & CEPH_CAP_AUTH_SHARED) && 3493 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { 3494 umode_t mode = le32_to_cpu(grant->mode); 3495 3496 if (inode_wrong_type(inode, mode)) 3497 pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", 3498 ceph_vinop(inode), inode->i_mode, mode); 3499 else 3500 inode->i_mode = mode; 3501 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); 3502 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); 3503 ci->i_btime = extra_info->btime; 3504 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, 3505 from_kuid(&init_user_ns, inode->i_uid), 3506 from_kgid(&init_user_ns, inode->i_gid)); 3507 #if IS_ENABLED(CONFIG_FS_ENCRYPTION) 3508 if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || 3509 memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, 3510 ci->fscrypt_auth_len)) 3511 pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", 3512 __func__, ci->fscrypt_auth_len, 3513 extra_info->fscrypt_auth_len); 3514 #endif 3515 } 3516 3517 if ((newcaps & CEPH_CAP_LINK_SHARED) && 3518 (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { 3519 set_nlink(inode, le32_to_cpu(grant->nlink)); 3520 if (inode->i_nlink == 0) 3521 deleted_inode = true; 3522 } 3523 3524 if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && 3525 grant->xattr_len) { 3526 int len = le32_to_cpu(grant->xattr_len); 3527 u64 version = le64_to_cpu(grant->xattr_version); 3528 3529 if (version > ci->i_xattrs.version) { 3530 dout(" got new xattrs v%llu on %p len %d\n", 3531 version, inode, len); 3532 if (ci->i_xattrs.blob) 3533 ceph_buffer_put(ci->i_xattrs.blob); 3534 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 3535 ci->i_xattrs.version = version; 3536 ceph_forget_all_cached_acls(inode); 3537 ceph_security_invalidate_secctx(inode); 3538 } 3539 } 3540 3541 if (newcaps & CEPH_CAP_ANY_RD) { 3542 struct timespec64 mtime, atime, ctime; 3543 /* ctime/mtime/atime? */ 3544 ceph_decode_timespec64(&mtime, &grant->mtime); 3545 ceph_decode_timespec64(&atime, &grant->atime); 3546 ceph_decode_timespec64(&ctime, &grant->ctime); 3547 ceph_fill_file_time(inode, extra_info->issued, 3548 le32_to_cpu(grant->time_warp_seq), 3549 &ctime, &mtime, &atime); 3550 } 3551 3552 if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { 3553 ci->i_files = extra_info->nfiles; 3554 ci->i_subdirs = extra_info->nsubdirs; 3555 } 3556 3557 if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { 3558 /* file layout may have changed */ 3559 s64 old_pool = ci->i_layout.pool_id; 3560 struct ceph_string *old_ns; 3561 3562 ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); 3563 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, 3564 lockdep_is_held(&ci->i_ceph_lock)); 3565 rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); 3566 3567 if (ci->i_layout.pool_id != old_pool || 3568 extra_info->pool_ns != old_ns) 3569 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; 3570 3571 extra_info->pool_ns = old_ns; 3572 3573 /* size/truncate_seq? 
*/ 3574 queue_trunc = ceph_fill_file_size(inode, extra_info->issued, 3575 le32_to_cpu(grant->truncate_seq), 3576 le64_to_cpu(grant->truncate_size), 3577 size); 3578 } 3579 3580 if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { 3581 if (max_size != ci->i_max_size) { 3582 dout("max_size %lld -> %llu\n", 3583 ci->i_max_size, max_size); 3584 ci->i_max_size = max_size; 3585 if (max_size >= ci->i_wanted_max_size) { 3586 ci->i_wanted_max_size = 0; /* reset */ 3587 ci->i_requested_max_size = 0; 3588 } 3589 wake = true; 3590 } 3591 } 3592 3593 /* check cap bits */ 3594 wanted = __ceph_caps_wanted(ci); 3595 used = __ceph_caps_used(ci); 3596 dirty = __ceph_caps_dirty(ci); 3597 dout(" my wanted = %s, used = %s, dirty %s\n", 3598 ceph_cap_string(wanted), 3599 ceph_cap_string(used), 3600 ceph_cap_string(dirty)); 3601 3602 if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && 3603 (wanted & ~(cap->mds_wanted | newcaps))) { 3604 /* 3605 * If the mds is importing this cap, prior cap messages that update 3606 * 'wanted' may get dropped by the mds (migrate seq mismatch). 3607 * 3608 * We don't send a cap message to update 'wanted' if what we 3609 * want is already issued. If the mds revokes caps, the cap message 3610 * that releases caps also tells the mds what we want. But if 3611 * caps got revoked by the mds forcibly (session stale), we may 3612 * not have told the mds what we want. 3613 */ 3614 check_caps = 1; 3615 } 3616 3617 /* revocation, grant, or no-op? */ 3618 if (cap->issued & ~newcaps) { 3619 int revoking = cap->issued & ~newcaps; 3620 3621 dout("revocation: %s -> %s (revoking %s)\n", 3622 ceph_cap_string(cap->issued), 3623 ceph_cap_string(newcaps), 3624 ceph_cap_string(revoking)); 3625 if (S_ISREG(inode->i_mode) && 3626 (revoking & used & CEPH_CAP_FILE_BUFFER)) 3627 writeback = true; /* initiate writeback; will delay ack */ 3628 else if (queue_invalidate && 3629 revoking == CEPH_CAP_FILE_CACHE && 3630 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) 3631 ; /* do nothing yet, invalidation will be queued */ 3632 else if (cap == ci->i_auth_cap) 3633 check_caps = 1; /* check auth cap only */ 3634 else 3635 check_caps = 2; /* check all caps */ 3636 /* If there are new caps, try to wake up the waiters */ 3637 if (~cap->issued & newcaps) 3638 wake = true; 3639 cap->issued = newcaps; 3640 cap->implemented |= newcaps; 3641 } else if (cap->issued == newcaps) { 3642 dout("caps unchanged: %s -> %s\n", 3643 ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); 3644 } else { 3645 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), 3646 ceph_cap_string(newcaps)); 3647 /* non-auth MDS is revoking the newly granted caps?
*/ 3648 if (cap == ci->i_auth_cap && 3649 __ceph_caps_revoking_other(ci, cap, newcaps)) 3650 check_caps = 2; 3651 3652 cap->issued = newcaps; 3653 cap->implemented |= newcaps; /* add bits only, to 3654 * avoid stepping on a 3655 * pending revocation */ 3656 wake = true; 3657 } 3658 BUG_ON(cap->issued & ~cap->implemented); 3659 3660 /* don't let check_caps skip sending a response to MDS for revoke msgs */ 3661 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { 3662 cap->mds_wanted = 0; 3663 if (cap == ci->i_auth_cap) 3664 check_caps = 1; /* check auth cap only */ 3665 else 3666 check_caps = 2; /* check all caps */ 3667 } 3668 3669 if (extra_info->inline_version > 0 && 3670 extra_info->inline_version >= ci->i_inline_version) { 3671 ci->i_inline_version = extra_info->inline_version; 3672 if (ci->i_inline_version != CEPH_INLINE_NONE && 3673 (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) 3674 fill_inline = true; 3675 } 3676 3677 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3678 if (ci->i_auth_cap == cap) { 3679 if (newcaps & ~extra_info->issued) 3680 wake = true; 3681 3682 if (ci->i_requested_max_size > max_size || 3683 !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { 3684 /* re-request max_size if necessary */ 3685 ci->i_requested_max_size = 0; 3686 wake = true; 3687 } 3688 3689 ceph_kick_flushing_inode_caps(session, ci); 3690 } 3691 up_read(&session->s_mdsc->snap_rwsem); 3692 } 3693 spin_unlock(&ci->i_ceph_lock); 3694 3695 if (fill_inline) 3696 ceph_fill_inline_data(inode, NULL, extra_info->inline_data, 3697 extra_info->inline_len); 3698 3699 if (queue_trunc) 3700 ceph_queue_vmtruncate(inode); 3701 3702 if (writeback) 3703 /* 3704 * queue inode for writeback: we can't actually call 3705 * filemap_write_and_wait, etc. from message handler 3706 * context. 3707 */ 3708 ceph_queue_writeback(inode); 3709 if (queue_invalidate) 3710 ceph_queue_invalidate(inode); 3711 if (deleted_inode) 3712 invalidate_aliases(inode); 3713 if (wake) 3714 wake_up_all(&ci->i_cap_wq); 3715 3716 mutex_unlock(&session->s_mutex); 3717 if (check_caps == 1) 3718 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); 3719 else if (check_caps == 2) 3720 ceph_check_caps(ci, CHECK_CAPS_NOINVAL); 3721 } 3722 3723 /* 3724 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the 3725 * MDS has been safely committed. 3726 */ 3727 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, 3728 struct ceph_mds_caps *m, 3729 struct ceph_mds_session *session, 3730 struct ceph_cap *cap) 3731 __releases(ci->i_ceph_lock) 3732 { 3733 struct ceph_inode_info *ci = ceph_inode(inode); 3734 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 3735 struct ceph_cap_flush *cf, *tmp_cf; 3736 LIST_HEAD(to_remove); 3737 unsigned seq = le32_to_cpu(m->seq); 3738 int dirty = le32_to_cpu(m->dirty); 3739 int cleaned = 0; 3740 bool drop = false; 3741 bool wake_ci = false; 3742 bool wake_mdsc = false; 3743 3744 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { 3745 /* Is this the one that was flushed? */ 3746 if (cf->tid == flush_tid) 3747 cleaned = cf->caps; 3748 3749 /* Is this a capsnap? */ 3750 if (cf->is_capsnap) 3751 continue; 3752 3753 if (cf->tid <= flush_tid) { 3754 /* 3755 * An earlier or current tid. The FLUSH_ACK should 3756 * represent a superset of this flush's caps. 3757 */ 3758 wake_ci |= __detach_cap_flush_from_ci(ci, cf); 3759 list_add_tail(&cf->i_list, &to_remove); 3760 } else { 3761 /* 3762 * This is a later one. 
Any caps in it are still dirty 3763 * so don't count them as cleaned. 3764 */ 3765 cleaned &= ~cf->caps; 3766 if (!cleaned) 3767 break; 3768 } 3769 } 3770 3771 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," 3772 " flushing %s -> %s\n", 3773 inode, session->s_mds, seq, ceph_cap_string(dirty), 3774 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), 3775 ceph_cap_string(ci->i_flushing_caps & ~cleaned)); 3776 3777 if (list_empty(&to_remove) && !cleaned) 3778 goto out; 3779 3780 ci->i_flushing_caps &= ~cleaned; 3781 3782 spin_lock(&mdsc->cap_dirty_lock); 3783 3784 list_for_each_entry(cf, &to_remove, i_list) 3785 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); 3786 3787 if (ci->i_flushing_caps == 0) { 3788 if (list_empty(&ci->i_cap_flush_list)) { 3789 list_del_init(&ci->i_flushing_item); 3790 if (!list_empty(&session->s_cap_flushing)) { 3791 dout(" mds%d still flushing cap on %p\n", 3792 session->s_mds, 3793 &list_first_entry(&session->s_cap_flushing, 3794 struct ceph_inode_info, 3795 i_flushing_item)->netfs.inode); 3796 } 3797 } 3798 mdsc->num_cap_flushing--; 3799 dout(" inode %p now !flushing\n", inode); 3800 3801 if (ci->i_dirty_caps == 0) { 3802 dout(" inode %p now clean\n", inode); 3803 BUG_ON(!list_empty(&ci->i_dirty_item)); 3804 drop = true; 3805 if (ci->i_wr_ref == 0 && 3806 ci->i_wrbuffer_ref_head == 0) { 3807 BUG_ON(!ci->i_head_snapc); 3808 ceph_put_snap_context(ci->i_head_snapc); 3809 ci->i_head_snapc = NULL; 3810 } 3811 } else { 3812 BUG_ON(list_empty(&ci->i_dirty_item)); 3813 } 3814 } 3815 spin_unlock(&mdsc->cap_dirty_lock); 3816 3817 out: 3818 spin_unlock(&ci->i_ceph_lock); 3819 3820 while (!list_empty(&to_remove)) { 3821 cf = list_first_entry(&to_remove, 3822 struct ceph_cap_flush, i_list); 3823 list_del_init(&cf->i_list); 3824 if (!cf->is_capsnap) 3825 ceph_free_cap_flush(cf); 3826 } 3827 3828 if (wake_ci) 3829 wake_up_all(&ci->i_cap_wq); 3830 if (wake_mdsc) 3831 wake_up_all(&mdsc->cap_flushing_wq); 3832 if (drop) 3833 iput(inode); 3834 } 3835 3836 void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, 3837 bool *wake_ci, bool *wake_mdsc) 3838 { 3839 struct ceph_inode_info *ci = ceph_inode(inode); 3840 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 3841 bool ret; 3842 3843 lockdep_assert_held(&ci->i_ceph_lock); 3844 3845 dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); 3846 3847 list_del_init(&capsnap->ci_item); 3848 ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); 3849 if (wake_ci) 3850 *wake_ci = ret; 3851 3852 spin_lock(&mdsc->cap_dirty_lock); 3853 if (list_empty(&ci->i_cap_flush_list)) 3854 list_del_init(&ci->i_flushing_item); 3855 3856 ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); 3857 if (wake_mdsc) 3858 *wake_mdsc = ret; 3859 spin_unlock(&mdsc->cap_dirty_lock); 3860 } 3861 3862 void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, 3863 bool *wake_ci, bool *wake_mdsc) 3864 { 3865 struct ceph_inode_info *ci = ceph_inode(inode); 3866 3867 lockdep_assert_held(&ci->i_ceph_lock); 3868 3869 WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); 3870 __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); 3871 } 3872 3873 /* 3874 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can 3875 * throw away our cap_snap. 3876 * 3877 * Caller hold s_mutex. 
3878 */ 3879 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, 3880 struct ceph_mds_caps *m, 3881 struct ceph_mds_session *session) 3882 { 3883 struct ceph_inode_info *ci = ceph_inode(inode); 3884 struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; 3885 u64 follows = le64_to_cpu(m->snap_follows); 3886 struct ceph_cap_snap *capsnap = NULL, *iter; 3887 bool wake_ci = false; 3888 bool wake_mdsc = false; 3889 3890 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 3891 inode, ci, session->s_mds, follows); 3892 3893 spin_lock(&ci->i_ceph_lock); 3894 list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { 3895 if (iter->follows == follows) { 3896 if (iter->cap_flush.tid != flush_tid) { 3897 dout(" cap_snap %p follows %lld tid %lld !=" 3898 " %lld\n", iter, follows, 3899 flush_tid, iter->cap_flush.tid); 3900 break; 3901 } 3902 capsnap = iter; 3903 break; 3904 } else { 3905 dout(" skipping cap_snap %p follows %lld\n", 3906 iter, iter->follows); 3907 } 3908 } 3909 if (capsnap) 3910 ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); 3911 spin_unlock(&ci->i_ceph_lock); 3912 3913 if (capsnap) { 3914 ceph_put_snap_context(capsnap->context); 3915 ceph_put_cap_snap(capsnap); 3916 if (wake_ci) 3917 wake_up_all(&ci->i_cap_wq); 3918 if (wake_mdsc) 3919 wake_up_all(&mdsc->cap_flushing_wq); 3920 iput(inode); 3921 } 3922 } 3923 3924 /* 3925 * Handle TRUNC from MDS, indicating file truncation. 3926 * 3927 * caller hold s_mutex. 3928 */ 3929 static bool handle_cap_trunc(struct inode *inode, 3930 struct ceph_mds_caps *trunc, 3931 struct ceph_mds_session *session, 3932 struct cap_extra_info *extra_info) 3933 { 3934 struct ceph_inode_info *ci = ceph_inode(inode); 3935 int mds = session->s_mds; 3936 int seq = le32_to_cpu(trunc->seq); 3937 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); 3938 u64 truncate_size = le64_to_cpu(trunc->truncate_size); 3939 u64 size = le64_to_cpu(trunc->size); 3940 int implemented = 0; 3941 int dirty = __ceph_caps_dirty(ci); 3942 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); 3943 bool queue_trunc = false; 3944 3945 lockdep_assert_held(&ci->i_ceph_lock); 3946 3947 issued |= implemented | dirty; 3948 3949 /* 3950 * If there is at least one crypto block then we'll trust 3951 * fscrypt_file_size. If the real length of the file is 0, then 3952 * ignore it (it has probably been truncated down to 0 by the MDS). 3953 */ 3954 if (IS_ENCRYPTED(inode) && size) 3955 size = extra_info->fscrypt_file_size; 3956 3957 dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n", 3958 __func__, inode, mds, seq, truncate_size, truncate_seq); 3959 queue_trunc = ceph_fill_file_size(inode, issued, 3960 truncate_seq, truncate_size, size); 3961 return queue_trunc; 3962 } 3963 3964 /* 3965 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a 3966 * different one. If we are the most recent migration we've seen (as 3967 * indicated by mseq), make note of the migrating cap bits for the 3968 * duration (until we see the corresponding IMPORT). 
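 *
 * If we already hold a cap from the target MDS, fold the exported cap's
 * issued bits into it; otherwise open a session to the target MDS and
 * insert a placeholder cap that the subsequent IMPORT will fill in.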
3969 * 3970 * caller holds s_mutex 3971 */ 3972 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 3973 struct ceph_mds_cap_peer *ph, 3974 struct ceph_mds_session *session) 3975 { 3976 struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; 3977 struct ceph_mds_session *tsession = NULL; 3978 struct ceph_cap *cap, *tcap, *new_cap = NULL; 3979 struct ceph_inode_info *ci = ceph_inode(inode); 3980 u64 t_cap_id; 3981 unsigned mseq = le32_to_cpu(ex->migrate_seq); 3982 unsigned t_seq, t_mseq; 3983 int target, issued; 3984 int mds = session->s_mds; 3985 3986 if (ph) { 3987 t_cap_id = le64_to_cpu(ph->cap_id); 3988 t_seq = le32_to_cpu(ph->seq); 3989 t_mseq = le32_to_cpu(ph->mseq); 3990 target = le32_to_cpu(ph->mds); 3991 } else { 3992 t_cap_id = t_seq = t_mseq = 0; 3993 target = -1; 3994 } 3995 3996 dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", 3997 inode, ci, mds, mseq, target); 3998 retry: 3999 down_read(&mdsc->snap_rwsem); 4000 spin_lock(&ci->i_ceph_lock); 4001 cap = __get_cap_for_mds(ci, mds); 4002 if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) 4003 goto out_unlock; 4004 4005 if (target < 0) { 4006 ceph_remove_cap(mdsc, cap, false); 4007 goto out_unlock; 4008 } 4009 4010 /* 4011 * now we know we haven't received the cap import message yet 4012 * because the exported cap still exist. 4013 */ 4014 4015 issued = cap->issued; 4016 if (issued != cap->implemented) 4017 pr_err_ratelimited("handle_cap_export: issued != implemented: " 4018 "ino (%llx.%llx) mds%d seq %d mseq %d " 4019 "issued %s implemented %s\n", 4020 ceph_vinop(inode), mds, cap->seq, cap->mseq, 4021 ceph_cap_string(issued), 4022 ceph_cap_string(cap->implemented)); 4023 4024 4025 tcap = __get_cap_for_mds(ci, target); 4026 if (tcap) { 4027 /* already have caps from the target */ 4028 if (tcap->cap_id == t_cap_id && 4029 ceph_seq_cmp(tcap->seq, t_seq) < 0) { 4030 dout(" updating import cap %p mds%d\n", tcap, target); 4031 tcap->cap_id = t_cap_id; 4032 tcap->seq = t_seq - 1; 4033 tcap->issue_seq = t_seq - 1; 4034 tcap->issued |= issued; 4035 tcap->implemented |= issued; 4036 if (cap == ci->i_auth_cap) { 4037 ci->i_auth_cap = tcap; 4038 change_auth_cap_ses(ci, tcap->session); 4039 } 4040 } 4041 ceph_remove_cap(mdsc, cap, false); 4042 goto out_unlock; 4043 } else if (tsession) { 4044 /* add placeholder for the export tagert */ 4045 int flag = (cap == ci->i_auth_cap) ? 
CEPH_CAP_FLAG_AUTH : 0; 4046 tcap = new_cap; 4047 ceph_add_cap(inode, tsession, t_cap_id, issued, 0, 4048 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); 4049 4050 if (!list_empty(&ci->i_cap_flush_list) && 4051 ci->i_auth_cap == tcap) { 4052 spin_lock(&mdsc->cap_dirty_lock); 4053 list_move_tail(&ci->i_flushing_item, 4054 &tcap->session->s_cap_flushing); 4055 spin_unlock(&mdsc->cap_dirty_lock); 4056 } 4057 4058 ceph_remove_cap(mdsc, cap, false); 4059 goto out_unlock; 4060 } 4061 4062 spin_unlock(&ci->i_ceph_lock); 4063 up_read(&mdsc->snap_rwsem); 4064 mutex_unlock(&session->s_mutex); 4065 4066 /* open target session */ 4067 tsession = ceph_mdsc_open_export_target_session(mdsc, target); 4068 if (!IS_ERR(tsession)) { 4069 if (mds > target) { 4070 mutex_lock(&session->s_mutex); 4071 mutex_lock_nested(&tsession->s_mutex, 4072 SINGLE_DEPTH_NESTING); 4073 } else { 4074 mutex_lock(&tsession->s_mutex); 4075 mutex_lock_nested(&session->s_mutex, 4076 SINGLE_DEPTH_NESTING); 4077 } 4078 new_cap = ceph_get_cap(mdsc, NULL); 4079 } else { 4080 WARN_ON(1); 4081 tsession = NULL; 4082 target = -1; 4083 mutex_lock(&session->s_mutex); 4084 } 4085 goto retry; 4086 4087 out_unlock: 4088 spin_unlock(&ci->i_ceph_lock); 4089 up_read(&mdsc->snap_rwsem); 4090 mutex_unlock(&session->s_mutex); 4091 if (tsession) { 4092 mutex_unlock(&tsession->s_mutex); 4093 ceph_put_mds_session(tsession); 4094 } 4095 if (new_cap) 4096 ceph_put_cap(mdsc, new_cap); 4097 } 4098 4099 /* 4100 * Handle cap IMPORT. 4101 * 4102 * caller holds s_mutex. acquires i_ceph_lock 4103 */ 4104 static void handle_cap_import(struct ceph_mds_client *mdsc, 4105 struct inode *inode, struct ceph_mds_caps *im, 4106 struct ceph_mds_cap_peer *ph, 4107 struct ceph_mds_session *session, 4108 struct ceph_cap **target_cap, int *old_issued) 4109 { 4110 struct ceph_inode_info *ci = ceph_inode(inode); 4111 struct ceph_cap *cap, *ocap, *new_cap = NULL; 4112 int mds = session->s_mds; 4113 int issued; 4114 unsigned caps = le32_to_cpu(im->caps); 4115 unsigned wanted = le32_to_cpu(im->wanted); 4116 unsigned seq = le32_to_cpu(im->seq); 4117 unsigned mseq = le32_to_cpu(im->migrate_seq); 4118 u64 realmino = le64_to_cpu(im->realm); 4119 u64 cap_id = le64_to_cpu(im->cap_id); 4120 u64 p_cap_id; 4121 int peer; 4122 4123 if (ph) { 4124 p_cap_id = le64_to_cpu(ph->cap_id); 4125 peer = le32_to_cpu(ph->mds); 4126 } else { 4127 p_cap_id = 0; 4128 peer = -1; 4129 } 4130 4131 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", 4132 inode, ci, mds, mseq, peer); 4133 retry: 4134 cap = __get_cap_for_mds(ci, mds); 4135 if (!cap) { 4136 if (!new_cap) { 4137 spin_unlock(&ci->i_ceph_lock); 4138 new_cap = ceph_get_cap(mdsc, NULL); 4139 spin_lock(&ci->i_ceph_lock); 4140 goto retry; 4141 } 4142 cap = new_cap; 4143 } else { 4144 if (new_cap) { 4145 ceph_put_cap(mdsc, new_cap); 4146 new_cap = NULL; 4147 } 4148 } 4149 4150 __ceph_caps_issued(ci, &issued); 4151 issued |= __ceph_caps_dirty(ci); 4152 4153 ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, 4154 realmino, CEPH_CAP_FLAG_AUTH, &new_cap); 4155 4156 ocap = peer >= 0 ? 
__get_cap_for_mds(ci, peer) : NULL; 4157 if (ocap && ocap->cap_id == p_cap_id) { 4158 dout(" remove export cap %p mds%d flags %d\n", 4159 ocap, peer, ph->flags); 4160 if ((ph->flags & CEPH_CAP_FLAG_AUTH) && 4161 (ocap->seq != le32_to_cpu(ph->seq) || 4162 ocap->mseq != le32_to_cpu(ph->mseq))) { 4163 pr_err_ratelimited("handle_cap_import: " 4164 "mismatched seq/mseq: ino (%llx.%llx) " 4165 "mds%d seq %d mseq %d importer mds%d " 4166 "has peer seq %d mseq %d\n", 4167 ceph_vinop(inode), peer, ocap->seq, 4168 ocap->mseq, mds, le32_to_cpu(ph->seq), 4169 le32_to_cpu(ph->mseq)); 4170 } 4171 ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); 4172 } 4173 4174 *old_issued = issued; 4175 *target_cap = cap; 4176 } 4177 4178 #ifdef CONFIG_FS_ENCRYPTION 4179 static int parse_fscrypt_fields(void **p, void *end, 4180 struct cap_extra_info *extra) 4181 { 4182 u32 len; 4183 4184 ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); 4185 if (extra->fscrypt_auth_len) { 4186 ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); 4187 extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, 4188 GFP_KERNEL); 4189 if (!extra->fscrypt_auth) 4190 return -ENOMEM; 4191 ceph_decode_copy_safe(p, end, extra->fscrypt_auth, 4192 extra->fscrypt_auth_len, bad); 4193 } 4194 4195 ceph_decode_32_safe(p, end, len, bad); 4196 if (len >= sizeof(u64)) { 4197 ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); 4198 len -= sizeof(u64); 4199 } 4200 ceph_decode_skip_n(p, end, len, bad); 4201 return 0; 4202 bad: 4203 return -EIO; 4204 } 4205 #else 4206 static int parse_fscrypt_fields(void **p, void *end, 4207 struct cap_extra_info *extra) 4208 { 4209 u32 len; 4210 4211 /* Don't care about these fields unless we're encryption-capable */ 4212 ceph_decode_32_safe(p, end, len, bad); 4213 if (len) 4214 ceph_decode_skip_n(p, end, len, bad); 4215 ceph_decode_32_safe(p, end, len, bad); 4216 if (len) 4217 ceph_decode_skip_n(p, end, len, bad); 4218 return 0; 4219 bad: 4220 return -EIO; 4221 } 4222 #endif 4223 4224 /* 4225 * Handle a caps message from the MDS. 4226 * 4227 * Identify the appropriate session, inode, and call the right handler 4228 * based on the cap op. 
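 *
 * The front payload begins with struct ceph_mds_caps followed by the snap
 * trace; newer fields (flock blob, cap peer, inline data, OSD epoch
 * barrier, pool namespace, btime/change_attr, dir stats, fscrypt fields)
 * are decoded only when msg->hdr.version is recent enough.  If we don't
 * have the inode or cap the MDS is referring to, a cap release may be
 * queued back to it so it can clean up its state.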
4229 */ 4230 void ceph_handle_caps(struct ceph_mds_session *session, 4231 struct ceph_msg *msg) 4232 { 4233 struct ceph_mds_client *mdsc = session->s_mdsc; 4234 struct inode *inode; 4235 struct ceph_inode_info *ci; 4236 struct ceph_cap *cap; 4237 struct ceph_mds_caps *h; 4238 struct ceph_mds_cap_peer *peer = NULL; 4239 struct ceph_snap_realm *realm = NULL; 4240 int op; 4241 int msg_version = le16_to_cpu(msg->hdr.version); 4242 u32 seq, mseq; 4243 struct ceph_vino vino; 4244 void *snaptrace; 4245 size_t snaptrace_len; 4246 void *p, *end; 4247 struct cap_extra_info extra_info = {}; 4248 bool queue_trunc; 4249 bool close_sessions = false; 4250 bool do_cap_release = false; 4251 4252 dout("handle_caps from mds%d\n", session->s_mds); 4253 4254 if (!ceph_inc_mds_stopping_blocker(mdsc, session)) 4255 return; 4256 4257 /* decode */ 4258 end = msg->front.iov_base + msg->front.iov_len; 4259 if (msg->front.iov_len < sizeof(*h)) 4260 goto bad; 4261 h = msg->front.iov_base; 4262 op = le32_to_cpu(h->op); 4263 vino.ino = le64_to_cpu(h->ino); 4264 vino.snap = CEPH_NOSNAP; 4265 seq = le32_to_cpu(h->seq); 4266 mseq = le32_to_cpu(h->migrate_seq); 4267 4268 snaptrace = h + 1; 4269 snaptrace_len = le32_to_cpu(h->snap_trace_len); 4270 p = snaptrace + snaptrace_len; 4271 4272 if (msg_version >= 2) { 4273 u32 flock_len; 4274 ceph_decode_32_safe(&p, end, flock_len, bad); 4275 if (p + flock_len > end) 4276 goto bad; 4277 p += flock_len; 4278 } 4279 4280 if (msg_version >= 3) { 4281 if (op == CEPH_CAP_OP_IMPORT) { 4282 if (p + sizeof(*peer) > end) 4283 goto bad; 4284 peer = p; 4285 p += sizeof(*peer); 4286 } else if (op == CEPH_CAP_OP_EXPORT) { 4287 /* recorded in unused fields */ 4288 peer = (void *)&h->size; 4289 } 4290 } 4291 4292 if (msg_version >= 4) { 4293 ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); 4294 ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); 4295 if (p + extra_info.inline_len > end) 4296 goto bad; 4297 extra_info.inline_data = p; 4298 p += extra_info.inline_len; 4299 } 4300 4301 if (msg_version >= 5) { 4302 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; 4303 u32 epoch_barrier; 4304 4305 ceph_decode_32_safe(&p, end, epoch_barrier, bad); 4306 ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); 4307 } 4308 4309 if (msg_version >= 8) { 4310 u32 pool_ns_len; 4311 4312 /* version >= 6 */ 4313 ceph_decode_skip_64(&p, end, bad); // flush_tid 4314 /* version >= 7 */ 4315 ceph_decode_skip_32(&p, end, bad); // caller_uid 4316 ceph_decode_skip_32(&p, end, bad); // caller_gid 4317 /* version >= 8 */ 4318 ceph_decode_32_safe(&p, end, pool_ns_len, bad); 4319 if (pool_ns_len > 0) { 4320 ceph_decode_need(&p, end, pool_ns_len, bad); 4321 extra_info.pool_ns = 4322 ceph_find_or_create_string(p, pool_ns_len); 4323 p += pool_ns_len; 4324 } 4325 } 4326 4327 if (msg_version >= 9) { 4328 struct ceph_timespec *btime; 4329 4330 if (p + sizeof(*btime) > end) 4331 goto bad; 4332 btime = p; 4333 ceph_decode_timespec64(&extra_info.btime, btime); 4334 p += sizeof(*btime); 4335 ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); 4336 } 4337 4338 if (msg_version >= 11) { 4339 /* version >= 10 */ 4340 ceph_decode_skip_32(&p, end, bad); // flags 4341 /* version >= 11 */ 4342 extra_info.dirstat_valid = true; 4343 ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); 4344 ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); 4345 } 4346 4347 if (msg_version >= 12) { 4348 if (parse_fscrypt_fields(&p, end, &extra_info)) 4349 goto bad; 4350 } 4351 4352 /* lookup ino */ 4353 inode = 
ceph_find_inode(mdsc->fsc->sb, vino); 4354 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 4355 vino.snap, inode); 4356 4357 mutex_lock(&session->s_mutex); 4358 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 4359 (unsigned)seq); 4360 4361 if (!inode) { 4362 dout(" i don't have ino %llx\n", vino.ino); 4363 4364 switch (op) { 4365 case CEPH_CAP_OP_IMPORT: 4366 case CEPH_CAP_OP_REVOKE: 4367 case CEPH_CAP_OP_GRANT: 4368 do_cap_release = true; 4369 break; 4370 default: 4371 break; 4372 } 4373 goto flush_cap_releases; 4374 } 4375 ci = ceph_inode(inode); 4376 4377 /* these will work even if we don't have a cap yet */ 4378 switch (op) { 4379 case CEPH_CAP_OP_FLUSHSNAP_ACK: 4380 handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), 4381 h, session); 4382 goto done; 4383 4384 case CEPH_CAP_OP_EXPORT: 4385 handle_cap_export(inode, h, peer, session); 4386 goto done_unlocked; 4387 4388 case CEPH_CAP_OP_IMPORT: 4389 realm = NULL; 4390 if (snaptrace_len) { 4391 down_write(&mdsc->snap_rwsem); 4392 if (ceph_update_snap_trace(mdsc, snaptrace, 4393 snaptrace + snaptrace_len, 4394 false, &realm)) { 4395 up_write(&mdsc->snap_rwsem); 4396 close_sessions = true; 4397 goto done; 4398 } 4399 downgrade_write(&mdsc->snap_rwsem); 4400 } else { 4401 down_read(&mdsc->snap_rwsem); 4402 } 4403 spin_lock(&ci->i_ceph_lock); 4404 handle_cap_import(mdsc, inode, h, peer, session, 4405 &cap, &extra_info.issued); 4406 handle_cap_grant(inode, session, cap, 4407 h, msg->middle, &extra_info); 4408 if (realm) 4409 ceph_put_snap_realm(mdsc, realm); 4410 goto done_unlocked; 4411 } 4412 4413 /* the rest require a cap */ 4414 spin_lock(&ci->i_ceph_lock); 4415 cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); 4416 if (!cap) { 4417 dout(" no cap on %p ino %llx.%llx from mds%d\n", 4418 inode, ceph_ino(inode), ceph_snap(inode), 4419 session->s_mds); 4420 spin_unlock(&ci->i_ceph_lock); 4421 switch (op) { 4422 case CEPH_CAP_OP_REVOKE: 4423 case CEPH_CAP_OP_GRANT: 4424 do_cap_release = true; 4425 break; 4426 default: 4427 break; 4428 } 4429 goto flush_cap_releases; 4430 } 4431 4432 /* note that each of these drops i_ceph_lock for us */ 4433 switch (op) { 4434 case CEPH_CAP_OP_REVOKE: 4435 case CEPH_CAP_OP_GRANT: 4436 __ceph_caps_issued(ci, &extra_info.issued); 4437 extra_info.issued |= __ceph_caps_dirty(ci); 4438 handle_cap_grant(inode, session, cap, 4439 h, msg->middle, &extra_info); 4440 goto done_unlocked; 4441 4442 case CEPH_CAP_OP_FLUSH_ACK: 4443 handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), 4444 h, session, cap); 4445 break; 4446 4447 case CEPH_CAP_OP_TRUNC: 4448 queue_trunc = handle_cap_trunc(inode, h, session, 4449 &extra_info); 4450 spin_unlock(&ci->i_ceph_lock); 4451 if (queue_trunc) 4452 ceph_queue_vmtruncate(inode); 4453 break; 4454 4455 default: 4456 spin_unlock(&ci->i_ceph_lock); 4457 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, 4458 ceph_cap_op_name(op)); 4459 } 4460 4461 done: 4462 mutex_unlock(&session->s_mutex); 4463 done_unlocked: 4464 iput(inode); 4465 out: 4466 ceph_dec_mds_stopping_blocker(mdsc); 4467 4468 ceph_put_string(extra_info.pool_ns); 4469 4470 /* Defer closing the sessions after s_mutex lock being released */ 4471 if (close_sessions) 4472 ceph_mdsc_close_sessions(mdsc); 4473 4474 kfree(extra_info.fscrypt_auth); 4475 return; 4476 4477 flush_cap_releases: 4478 /* 4479 * send any cap release message to try to move things 4480 * along for the mds (who clearly thinks we still have this 4481 * cap). 
4482 */ 4483 if (do_cap_release) { 4484 cap = ceph_get_cap(mdsc, NULL); 4485 cap->cap_ino = vino.ino; 4486 cap->queue_release = 1; 4487 cap->cap_id = le64_to_cpu(h->cap_id); 4488 cap->mseq = mseq; 4489 cap->seq = seq; 4490 cap->issue_seq = seq; 4491 spin_lock(&session->s_cap_lock); 4492 __ceph_queue_cap_release(session, cap); 4493 spin_unlock(&session->s_cap_lock); 4494 } 4495 ceph_flush_cap_releases(mdsc, session); 4496 goto done; 4497 4498 bad: 4499 pr_err("ceph_handle_caps: corrupt message\n"); 4500 ceph_msg_dump(msg); 4501 goto out; 4502 } 4503 4504 /* 4505 * Delayed work handler to process end of delayed cap release LRU list. 4506 * 4507 * If new caps are added to the list while processing it, these won't get 4508 * processed in this run. In this case, the ci->i_hold_caps_max will be 4509 * returned so that the work can be scheduled accordingly. 4510 */ 4511 unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) 4512 { 4513 struct inode *inode; 4514 struct ceph_inode_info *ci; 4515 struct ceph_mount_options *opt = mdsc->fsc->mount_options; 4516 unsigned long delay_max = opt->caps_wanted_delay_max * HZ; 4517 unsigned long loop_start = jiffies; 4518 unsigned long delay = 0; 4519 4520 dout("check_delayed_caps\n"); 4521 spin_lock(&mdsc->cap_delay_lock); 4522 while (!list_empty(&mdsc->cap_delay_list)) { 4523 ci = list_first_entry(&mdsc->cap_delay_list, 4524 struct ceph_inode_info, 4525 i_cap_delay_list); 4526 if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { 4527 dout("%s caps added recently. Exiting loop", __func__); 4528 delay = ci->i_hold_caps_max; 4529 break; 4530 } 4531 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && 4532 time_before(jiffies, ci->i_hold_caps_max)) 4533 break; 4534 list_del_init(&ci->i_cap_delay_list); 4535 4536 inode = igrab(&ci->netfs.inode); 4537 if (inode) { 4538 spin_unlock(&mdsc->cap_delay_lock); 4539 dout("check_delayed_caps on %p\n", inode); 4540 ceph_check_caps(ci, 0); 4541 iput(inode); 4542 spin_lock(&mdsc->cap_delay_lock); 4543 } 4544 } 4545 spin_unlock(&mdsc->cap_delay_lock); 4546 4547 return delay; 4548 } 4549 4550 /* 4551 * Flush all dirty caps to the mds 4552 */ 4553 static void flush_dirty_session_caps(struct ceph_mds_session *s) 4554 { 4555 struct ceph_mds_client *mdsc = s->s_mdsc; 4556 struct ceph_inode_info *ci; 4557 struct inode *inode; 4558 4559 dout("flush_dirty_caps\n"); 4560 spin_lock(&mdsc->cap_dirty_lock); 4561 while (!list_empty(&s->s_cap_dirty)) { 4562 ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, 4563 i_dirty_item); 4564 inode = &ci->netfs.inode; 4565 ihold(inode); 4566 dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); 4567 spin_unlock(&mdsc->cap_dirty_lock); 4568 ceph_wait_on_async_create(inode); 4569 ceph_check_caps(ci, CHECK_CAPS_FLUSH); 4570 iput(inode); 4571 spin_lock(&mdsc->cap_dirty_lock); 4572 } 4573 spin_unlock(&mdsc->cap_dirty_lock); 4574 dout("flush_dirty_caps done\n"); 4575 } 4576 4577 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) 4578 { 4579 ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); 4580 } 4581 4582 void __ceph_touch_fmode(struct ceph_inode_info *ci, 4583 struct ceph_mds_client *mdsc, int fmode) 4584 { 4585 unsigned long now = jiffies; 4586 if (fmode & CEPH_FILE_MODE_RD) 4587 ci->i_last_rd = now; 4588 if (fmode & CEPH_FILE_MODE_WR) 4589 ci->i_last_wr = now; 4590 /* queue periodic check */ 4591 if (fmode && 4592 __ceph_is_any_real_caps(ci) && 4593 list_empty(&ci->i_cap_delay_list)) 4594 __cap_delay_requeue(mdsc, ci); 4595 } 4596 4597 void 
ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) 4598 { 4599 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); 4600 int bits = (fmode << 1) | 1; 4601 bool already_opened = false; 4602 int i; 4603 4604 if (count == 1) 4605 atomic64_inc(&mdsc->metric.opened_files); 4606 4607 spin_lock(&ci->i_ceph_lock); 4608 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { 4609 /* 4610 * If any of the mode ref is larger than 0, 4611 * that means it has been already opened by 4612 * others. Just skip checking the PIN ref. 4613 */ 4614 if (i && ci->i_nr_by_mode[i]) 4615 already_opened = true; 4616 4617 if (bits & (1 << i)) 4618 ci->i_nr_by_mode[i] += count; 4619 } 4620 4621 if (!already_opened) 4622 percpu_counter_inc(&mdsc->metric.opened_inodes); 4623 spin_unlock(&ci->i_ceph_lock); 4624 } 4625 4626 /* 4627 * Drop open file reference. If we were the last open file, 4628 * we may need to release capabilities to the MDS (or schedule 4629 * their delayed release). 4630 */ 4631 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) 4632 { 4633 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); 4634 int bits = (fmode << 1) | 1; 4635 bool is_closed = true; 4636 int i; 4637 4638 if (count == 1) 4639 atomic64_dec(&mdsc->metric.opened_files); 4640 4641 spin_lock(&ci->i_ceph_lock); 4642 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { 4643 if (bits & (1 << i)) { 4644 BUG_ON(ci->i_nr_by_mode[i] < count); 4645 ci->i_nr_by_mode[i] -= count; 4646 } 4647 4648 /* 4649 * If any of the mode ref is not 0 after 4650 * decreased, that means it is still opened 4651 * by others. Just skip checking the PIN ref. 4652 */ 4653 if (i && ci->i_nr_by_mode[i]) 4654 is_closed = false; 4655 } 4656 4657 if (is_closed) 4658 percpu_counter_dec(&mdsc->metric.opened_inodes); 4659 spin_unlock(&ci->i_ceph_lock); 4660 } 4661 4662 /* 4663 * For a soon-to-be unlinked file, drop the LINK caps. If it 4664 * looks like the link count will hit 0, drop any other caps (other 4665 * than PIN) we don't specifically want (due to the file still being 4666 * open). 4667 */ 4668 int ceph_drop_caps_for_unlink(struct inode *inode) 4669 { 4670 struct ceph_inode_info *ci = ceph_inode(inode); 4671 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; 4672 4673 spin_lock(&ci->i_ceph_lock); 4674 if (inode->i_nlink == 1) { 4675 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); 4676 4677 if (__ceph_caps_dirty(ci)) { 4678 struct ceph_mds_client *mdsc = 4679 ceph_inode_to_fs_client(inode)->mdsc; 4680 __cap_delay_requeue_front(mdsc, ci); 4681 } 4682 } 4683 spin_unlock(&ci->i_ceph_lock); 4684 return drop; 4685 } 4686 4687 /* 4688 * Helpers for embedding cap and dentry lease releases into mds 4689 * requests. 4690 * 4691 * @force is used by dentry_release (below) to force inclusion of a 4692 * record for the directory inode, even when there aren't any caps to 4693 * drop. 
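 *
 * ceph_encode_inode_release() writes a struct ceph_mds_request_release at
 * *p and advances *p past it, returning 1 if a release record was emitted
 * and 0 if there was nothing to drop.  A caller would typically do
 * something like this (illustrative only, not taken from a real caller):
 *
 *	if (ceph_encode_inode_release(&p, inode, mds, drop, unless, 0))
 *		num_releases++;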
4694 */ 4695 int ceph_encode_inode_release(void **p, struct inode *inode, 4696 int mds, int drop, int unless, int force) 4697 { 4698 struct ceph_inode_info *ci = ceph_inode(inode); 4699 struct ceph_cap *cap; 4700 struct ceph_mds_request_release *rel = *p; 4701 int used, dirty; 4702 int ret = 0; 4703 4704 spin_lock(&ci->i_ceph_lock); 4705 used = __ceph_caps_used(ci); 4706 dirty = __ceph_caps_dirty(ci); 4707 4708 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", 4709 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), 4710 ceph_cap_string(unless)); 4711 4712 /* only drop unused, clean caps */ 4713 drop &= ~(used | dirty); 4714 4715 cap = __get_cap_for_mds(ci, mds); 4716 if (cap && __cap_is_valid(cap)) { 4717 unless &= cap->issued; 4718 if (unless) { 4719 if (unless & CEPH_CAP_AUTH_EXCL) 4720 drop &= ~CEPH_CAP_AUTH_SHARED; 4721 if (unless & CEPH_CAP_LINK_EXCL) 4722 drop &= ~CEPH_CAP_LINK_SHARED; 4723 if (unless & CEPH_CAP_XATTR_EXCL) 4724 drop &= ~CEPH_CAP_XATTR_SHARED; 4725 if (unless & CEPH_CAP_FILE_EXCL) 4726 drop &= ~CEPH_CAP_FILE_SHARED; 4727 } 4728 4729 if (force || (cap->issued & drop)) { 4730 if (cap->issued & drop) { 4731 int wanted = __ceph_caps_wanted(ci); 4732 dout("encode_inode_release %p cap %p " 4733 "%s -> %s, wanted %s -> %s\n", inode, cap, 4734 ceph_cap_string(cap->issued), 4735 ceph_cap_string(cap->issued & ~drop), 4736 ceph_cap_string(cap->mds_wanted), 4737 ceph_cap_string(wanted)); 4738 4739 cap->issued &= ~drop; 4740 cap->implemented &= ~drop; 4741 cap->mds_wanted = wanted; 4742 if (cap == ci->i_auth_cap && 4743 !(wanted & CEPH_CAP_ANY_FILE_WR)) 4744 ci->i_requested_max_size = 0; 4745 } else { 4746 dout("encode_inode_release %p cap %p %s" 4747 " (force)\n", inode, cap, 4748 ceph_cap_string(cap->issued)); 4749 } 4750 4751 rel->ino = cpu_to_le64(ceph_ino(inode)); 4752 rel->cap_id = cpu_to_le64(cap->cap_id); 4753 rel->seq = cpu_to_le32(cap->seq); 4754 rel->issue_seq = cpu_to_le32(cap->issue_seq); 4755 rel->mseq = cpu_to_le32(cap->mseq); 4756 rel->caps = cpu_to_le32(cap->implemented); 4757 rel->wanted = cpu_to_le32(cap->mds_wanted); 4758 rel->dname_len = 0; 4759 rel->dname_seq = 0; 4760 *p += sizeof(*rel); 4761 ret = 1; 4762 } else { 4763 dout("encode_inode_release %p cap %p %s (noop)\n", 4764 inode, cap, ceph_cap_string(cap->issued)); 4765 } 4766 } 4767 spin_unlock(&ci->i_ceph_lock); 4768 return ret; 4769 } 4770 4771 /** 4772 * ceph_encode_dentry_release - encode a dentry release into an outgoing request 4773 * @p: outgoing request buffer 4774 * @dentry: dentry to release 4775 * @dir: dir to release it from 4776 * @mds: mds that we're speaking to 4777 * @drop: caps being dropped 4778 * @unless: unless we have these caps 4779 * 4780 * Encode a dentry release into an outgoing request buffer. Returns 1 if the 4781 * thing was released, or a negative error code otherwise. 4782 */ 4783 int ceph_encode_dentry_release(void **p, struct dentry *dentry, 4784 struct inode *dir, 4785 int mds, int drop, int unless) 4786 { 4787 struct ceph_mds_request_release *rel = *p; 4788 struct ceph_dentry_info *di = ceph_dentry(dentry); 4789 int force = 0; 4790 int ret; 4791 4792 /* This shouldn't happen */ 4793 BUG_ON(!dir); 4794 4795 /* 4796 * force a record for the directory caps if we have a dentry lease. 4797 * this is racy (can't take i_ceph_lock and d_lock together), but it 4798 * doesn't have to be perfect; the mds will revoke anything we don't 4799 * release.
4800 */ 4801 spin_lock(&dentry->d_lock); 4802 if (di->lease_session && di->lease_session->s_mds == mds) 4803 force = 1; 4804 spin_unlock(&dentry->d_lock); 4805 4806 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); 4807 4808 spin_lock(&dentry->d_lock); 4809 if (ret && di->lease_session && di->lease_session->s_mds == mds) { 4810 dout("encode_dentry_release %p mds%d seq %d\n", 4811 dentry, mds, (int)di->lease_seq); 4812 rel->dname_seq = cpu_to_le32(di->lease_seq); 4813 __ceph_mdsc_drop_dentry_lease(dentry); 4814 spin_unlock(&dentry->d_lock); 4815 if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { 4816 int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); 4817 4818 if (ret2 < 0) 4819 return ret2; 4820 4821 rel->dname_len = cpu_to_le32(ret2); 4822 *p += ret2; 4823 } else { 4824 rel->dname_len = cpu_to_le32(dentry->d_name.len); 4825 memcpy(*p, dentry->d_name.name, dentry->d_name.len); 4826 *p += dentry->d_name.len; 4827 } 4828 } else { 4829 spin_unlock(&dentry->d_lock); 4830 } 4831 return ret; 4832 } 4833 4834 static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) 4835 { 4836 struct ceph_inode_info *ci = ceph_inode(inode); 4837 struct ceph_cap_snap *capsnap; 4838 int capsnap_release = 0; 4839 4840 lockdep_assert_held(&ci->i_ceph_lock); 4841 4842 dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); 4843 4844 while (!list_empty(&ci->i_cap_snaps)) { 4845 capsnap = list_first_entry(&ci->i_cap_snaps, 4846 struct ceph_cap_snap, ci_item); 4847 __ceph_remove_capsnap(inode, capsnap, NULL, NULL); 4848 ceph_put_snap_context(capsnap->context); 4849 ceph_put_cap_snap(capsnap); 4850 capsnap_release++; 4851 } 4852 wake_up_all(&ci->i_cap_wq); 4853 wake_up_all(&mdsc->cap_flushing_wq); 4854 return capsnap_release; 4855 } 4856 4857 int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) 4858 { 4859 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 4860 struct ceph_mds_client *mdsc = fsc->mdsc; 4861 struct ceph_inode_info *ci = ceph_inode(inode); 4862 bool is_auth; 4863 bool dirty_dropped = false; 4864 int iputs = 0; 4865 4866 lockdep_assert_held(&ci->i_ceph_lock); 4867 4868 dout("removing cap %p, ci is %p, inode is %p\n", 4869 cap, ci, &ci->netfs.inode); 4870 4871 is_auth = (cap == ci->i_auth_cap); 4872 __ceph_remove_cap(cap, false); 4873 if (is_auth) { 4874 struct ceph_cap_flush *cf; 4875 4876 if (ceph_inode_is_shutdown(inode)) { 4877 if (inode->i_data.nrpages > 0) 4878 *invalidate = true; 4879 if (ci->i_wrbuffer_ref > 0) 4880 mapping_set_error(&inode->i_data, -EIO); 4881 } 4882 4883 spin_lock(&mdsc->cap_dirty_lock); 4884 4885 /* trash all of the cap flushes for this inode */ 4886 while (!list_empty(&ci->i_cap_flush_list)) { 4887 cf = list_first_entry(&ci->i_cap_flush_list, 4888 struct ceph_cap_flush, i_list); 4889 list_del_init(&cf->g_list); 4890 list_del_init(&cf->i_list); 4891 if (!cf->is_capsnap) 4892 ceph_free_cap_flush(cf); 4893 } 4894 4895 if (!list_empty(&ci->i_dirty_item)) { 4896 pr_warn_ratelimited( 4897 " dropping dirty %s state for %p %lld\n", 4898 ceph_cap_string(ci->i_dirty_caps), 4899 inode, ceph_ino(inode)); 4900 ci->i_dirty_caps = 0; 4901 list_del_init(&ci->i_dirty_item); 4902 dirty_dropped = true; 4903 } 4904 if (!list_empty(&ci->i_flushing_item)) { 4905 pr_warn_ratelimited( 4906 " dropping dirty+flushing %s state for %p %lld\n", 4907 ceph_cap_string(ci->i_flushing_caps), 4908 inode, ceph_ino(inode)); 4909 ci->i_flushing_caps = 0; 4910 list_del_init(&ci->i_flushing_item); 4911 
mdsc->num_cap_flushing--; 4912 dirty_dropped = true; 4913 } 4914 spin_unlock(&mdsc->cap_dirty_lock); 4915 4916 if (dirty_dropped) { 4917 mapping_set_error(inode->i_mapping, -EIO); 4918 4919 if (ci->i_wrbuffer_ref_head == 0 && 4920 ci->i_wr_ref == 0 && 4921 ci->i_dirty_caps == 0 && 4922 ci->i_flushing_caps == 0) { 4923 ceph_put_snap_context(ci->i_head_snapc); 4924 ci->i_head_snapc = NULL; 4925 } 4926 } 4927 4928 if (atomic_read(&ci->i_filelock_ref) > 0) { 4929 /* make further file lock syscall return -EIO */ 4930 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; 4931 pr_warn_ratelimited(" dropping file locks for %p %lld\n", 4932 inode, ceph_ino(inode)); 4933 } 4934 4935 if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { 4936 cf = ci->i_prealloc_cap_flush; 4937 ci->i_prealloc_cap_flush = NULL; 4938 if (!cf->is_capsnap) 4939 ceph_free_cap_flush(cf); 4940 } 4941 4942 if (!list_empty(&ci->i_cap_snaps)) 4943 iputs = remove_capsnaps(mdsc, inode); 4944 } 4945 if (dirty_dropped) 4946 ++iputs; 4947 return iputs; 4948 } 4949