1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/ceph/ceph_debug.h> 3 4 #include <linux/module.h> 5 #include <linux/fs.h> 6 #include <linux/slab.h> 7 #include <linux/string.h> 8 #include <linux/uaccess.h> 9 #include <linux/kernel.h> 10 #include <linux/writeback.h> 11 #include <linux/vmalloc.h> 12 #include <linux/xattr.h> 13 #include <linux/posix_acl.h> 14 #include <linux/random.h> 15 #include <linux/sort.h> 16 #include <linux/iversion.h> 17 18 #include "super.h" 19 #include "mds_client.h" 20 #include "cache.h" 21 #include <linux/ceph/decode.h> 22 23 /* 24 * Ceph inode operations 25 * 26 * Implement basic inode helpers (get, alloc) and inode ops (getattr, 27 * setattr, etc.), xattr helpers, and helpers for assimilating 28 * metadata returned by the MDS into our cache. 29 * 30 * Also define helpers for doing asynchronous writeback, invalidation, 31 * and truncation for the benefit of those who can't afford to block 32 * (typically because they are in the message handler path). 33 */ 34 35 static const struct inode_operations ceph_symlink_iops; 36 37 static void ceph_inode_work(struct work_struct *work); 38 39 /* 40 * find or create an inode, given the ceph ino number 41 */ 42 static int ceph_set_ino_cb(struct inode *inode, void *data) 43 { 44 struct ceph_inode_info *ci = ceph_inode(inode); 45 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 46 47 ci->i_vino = *(struct ceph_vino *)data; 48 inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); 49 inode_set_iversion_raw(inode, 0); 50 percpu_counter_inc(&mdsc->metric.total_inodes); 51 52 return 0; 53 } 54 55 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) 56 { 57 struct inode *inode; 58 59 if (ceph_vino_is_reserved(vino)) 60 return ERR_PTR(-EREMOTEIO); 61 62 inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, 63 ceph_set_ino_cb, &vino); 64 if (!inode) 65 return ERR_PTR(-ENOMEM); 66 67 dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), 68 
ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); 69 return inode; 70 } 71 72 /* 73 * get/constuct snapdir inode for a given directory 74 */ 75 struct inode *ceph_get_snapdir(struct inode *parent) 76 { 77 struct ceph_vino vino = { 78 .ino = ceph_ino(parent), 79 .snap = CEPH_SNAPDIR, 80 }; 81 struct inode *inode = ceph_get_inode(parent->i_sb, vino); 82 struct ceph_inode_info *ci = ceph_inode(inode); 83 84 BUG_ON(!S_ISDIR(parent->i_mode)); 85 if (IS_ERR(inode)) 86 return inode; 87 inode->i_mode = parent->i_mode; 88 inode->i_uid = parent->i_uid; 89 inode->i_gid = parent->i_gid; 90 inode->i_mtime = parent->i_mtime; 91 inode->i_ctime = parent->i_ctime; 92 inode->i_atime = parent->i_atime; 93 ci->i_rbytes = 0; 94 ci->i_btime = ceph_inode(parent)->i_btime; 95 96 if (inode->i_state & I_NEW) { 97 inode->i_op = &ceph_snapdir_iops; 98 inode->i_fop = &ceph_snapdir_fops; 99 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 100 unlock_new_inode(inode); 101 } 102 103 return inode; 104 } 105 106 const struct inode_operations ceph_file_iops = { 107 .permission = ceph_permission, 108 .setattr = ceph_setattr, 109 .getattr = ceph_getattr, 110 .listxattr = ceph_listxattr, 111 .get_acl = ceph_get_acl, 112 .set_acl = ceph_set_acl, 113 }; 114 115 116 /* 117 * We use a 'frag tree' to keep track of the MDS's directory fragments 118 * for a given inode (usually there is just a single fragment). We 119 * need to know when a child frag is delegated to a new MDS, or when 120 * it is flagged as replicated, so we can direct our requests 121 * accordingly. 
122 */ 123 124 /* 125 * find/create a frag in the tree 126 */ 127 static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, 128 u32 f) 129 { 130 struct rb_node **p; 131 struct rb_node *parent = NULL; 132 struct ceph_inode_frag *frag; 133 int c; 134 135 p = &ci->i_fragtree.rb_node; 136 while (*p) { 137 parent = *p; 138 frag = rb_entry(parent, struct ceph_inode_frag, node); 139 c = ceph_frag_compare(f, frag->frag); 140 if (c < 0) 141 p = &(*p)->rb_left; 142 else if (c > 0) 143 p = &(*p)->rb_right; 144 else 145 return frag; 146 } 147 148 frag = kmalloc(sizeof(*frag), GFP_NOFS); 149 if (!frag) 150 return ERR_PTR(-ENOMEM); 151 152 frag->frag = f; 153 frag->split_by = 0; 154 frag->mds = -1; 155 frag->ndist = 0; 156 157 rb_link_node(&frag->node, parent, p); 158 rb_insert_color(&frag->node, &ci->i_fragtree); 159 160 dout("get_or_create_frag added %llx.%llx frag %x\n", 161 ceph_vinop(&ci->vfs_inode), f); 162 return frag; 163 } 164 165 /* 166 * find a specific frag @f 167 */ 168 struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) 169 { 170 struct rb_node *n = ci->i_fragtree.rb_node; 171 172 while (n) { 173 struct ceph_inode_frag *frag = 174 rb_entry(n, struct ceph_inode_frag, node); 175 int c = ceph_frag_compare(f, frag->frag); 176 if (c < 0) 177 n = n->rb_left; 178 else if (c > 0) 179 n = n->rb_right; 180 else 181 return frag; 182 } 183 return NULL; 184 } 185 186 /* 187 * Choose frag containing the given value @v. If @pfrag is 188 * specified, copy the frag delegation info to the caller if 189 * it is present. 
190 */ 191 static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 192 struct ceph_inode_frag *pfrag, int *found) 193 { 194 u32 t = ceph_frag_make(0, 0); 195 struct ceph_inode_frag *frag; 196 unsigned nway, i; 197 u32 n; 198 199 if (found) 200 *found = 0; 201 202 while (1) { 203 WARN_ON(!ceph_frag_contains_value(t, v)); 204 frag = __ceph_find_frag(ci, t); 205 if (!frag) 206 break; /* t is a leaf */ 207 if (frag->split_by == 0) { 208 if (pfrag) 209 memcpy(pfrag, frag, sizeof(*pfrag)); 210 if (found) 211 *found = 1; 212 break; 213 } 214 215 /* choose child */ 216 nway = 1 << frag->split_by; 217 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, 218 frag->split_by, nway); 219 for (i = 0; i < nway; i++) { 220 n = ceph_frag_make_child(t, frag->split_by, i); 221 if (ceph_frag_contains_value(n, v)) { 222 t = n; 223 break; 224 } 225 } 226 BUG_ON(i == nway); 227 } 228 dout("choose_frag(%x) = %x\n", v, t); 229 230 return t; 231 } 232 233 u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 234 struct ceph_inode_frag *pfrag, int *found) 235 { 236 u32 ret; 237 mutex_lock(&ci->i_fragtree_mutex); 238 ret = __ceph_choose_frag(ci, v, pfrag, found); 239 mutex_unlock(&ci->i_fragtree_mutex); 240 return ret; 241 } 242 243 /* 244 * Process dirfrag (delegation) info from the mds. Include leaf 245 * fragment in tree ONLY if ndist > 0. 
Otherwise, only 246 * branches/splits are included in i_fragtree) 247 */ 248 static int ceph_fill_dirfrag(struct inode *inode, 249 struct ceph_mds_reply_dirfrag *dirinfo) 250 { 251 struct ceph_inode_info *ci = ceph_inode(inode); 252 struct ceph_inode_frag *frag; 253 u32 id = le32_to_cpu(dirinfo->frag); 254 int mds = le32_to_cpu(dirinfo->auth); 255 int ndist = le32_to_cpu(dirinfo->ndist); 256 int diri_auth = -1; 257 int i; 258 int err = 0; 259 260 spin_lock(&ci->i_ceph_lock); 261 if (ci->i_auth_cap) 262 diri_auth = ci->i_auth_cap->mds; 263 spin_unlock(&ci->i_ceph_lock); 264 265 if (mds == -1) /* CDIR_AUTH_PARENT */ 266 mds = diri_auth; 267 268 mutex_lock(&ci->i_fragtree_mutex); 269 if (ndist == 0 && mds == diri_auth) { 270 /* no delegation info needed. */ 271 frag = __ceph_find_frag(ci, id); 272 if (!frag) 273 goto out; 274 if (frag->split_by == 0) { 275 /* tree leaf, remove */ 276 dout("fill_dirfrag removed %llx.%llx frag %x" 277 " (no ref)\n", ceph_vinop(inode), id); 278 rb_erase(&frag->node, &ci->i_fragtree); 279 kfree(frag); 280 } else { 281 /* tree branch, keep and clear */ 282 dout("fill_dirfrag cleared %llx.%llx frag %x" 283 " referral\n", ceph_vinop(inode), id); 284 frag->mds = -1; 285 frag->ndist = 0; 286 } 287 goto out; 288 } 289 290 291 /* find/add this frag to store mds delegation info */ 292 frag = __get_or_create_frag(ci, id); 293 if (IS_ERR(frag)) { 294 /* this is not the end of the world; we can continue 295 with bad/inaccurate delegation info */ 296 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", 297 ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); 298 err = -ENOMEM; 299 goto out; 300 } 301 302 frag->mds = mds; 303 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); 304 for (i = 0; i < frag->ndist; i++) 305 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); 306 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", 307 ceph_vinop(inode), frag->frag, frag->ndist); 308 309 out: 310 mutex_unlock(&ci->i_fragtree_mutex); 311 return err; 312 } 313 314 
static int frag_tree_split_cmp(const void *l, const void *r) 315 { 316 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; 317 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; 318 return ceph_frag_compare(le32_to_cpu(ls->frag), 319 le32_to_cpu(rs->frag)); 320 } 321 322 static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) 323 { 324 if (!frag) 325 return f == ceph_frag_make(0, 0); 326 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) 327 return false; 328 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); 329 } 330 331 static int ceph_fill_fragtree(struct inode *inode, 332 struct ceph_frag_tree_head *fragtree, 333 struct ceph_mds_reply_dirfrag *dirinfo) 334 { 335 struct ceph_inode_info *ci = ceph_inode(inode); 336 struct ceph_inode_frag *frag, *prev_frag = NULL; 337 struct rb_node *rb_node; 338 unsigned i, split_by, nsplits; 339 u32 id; 340 bool update = false; 341 342 mutex_lock(&ci->i_fragtree_mutex); 343 nsplits = le32_to_cpu(fragtree->nsplits); 344 if (nsplits != ci->i_fragtree_nsplits) { 345 update = true; 346 } else if (nsplits) { 347 i = prandom_u32() % nsplits; 348 id = le32_to_cpu(fragtree->splits[i].frag); 349 if (!__ceph_find_frag(ci, id)) 350 update = true; 351 } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { 352 rb_node = rb_first(&ci->i_fragtree); 353 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 354 if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) 355 update = true; 356 } 357 if (!update && dirinfo) { 358 id = le32_to_cpu(dirinfo->frag); 359 if (id != __ceph_choose_frag(ci, id, NULL, NULL)) 360 update = true; 361 } 362 if (!update) 363 goto out_unlock; 364 365 if (nsplits > 1) { 366 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), 367 frag_tree_split_cmp, NULL); 368 } 369 370 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 371 rb_node = rb_first(&ci->i_fragtree); 372 for (i = 0; i < nsplits; i++) { 373 id = 
le32_to_cpu(fragtree->splits[i].frag); 374 split_by = le32_to_cpu(fragtree->splits[i].by); 375 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { 376 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " 377 "frag %x split by %d\n", ceph_vinop(inode), 378 i, nsplits, id, split_by); 379 continue; 380 } 381 frag = NULL; 382 while (rb_node) { 383 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 384 if (ceph_frag_compare(frag->frag, id) >= 0) { 385 if (frag->frag != id) 386 frag = NULL; 387 else 388 rb_node = rb_next(rb_node); 389 break; 390 } 391 rb_node = rb_next(rb_node); 392 /* delete stale split/leaf node */ 393 if (frag->split_by > 0 || 394 !is_frag_child(frag->frag, prev_frag)) { 395 rb_erase(&frag->node, &ci->i_fragtree); 396 if (frag->split_by > 0) 397 ci->i_fragtree_nsplits--; 398 kfree(frag); 399 } 400 frag = NULL; 401 } 402 if (!frag) { 403 frag = __get_or_create_frag(ci, id); 404 if (IS_ERR(frag)) 405 continue; 406 } 407 if (frag->split_by == 0) 408 ci->i_fragtree_nsplits++; 409 frag->split_by = split_by; 410 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 411 prev_frag = frag; 412 } 413 while (rb_node) { 414 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 415 rb_node = rb_next(rb_node); 416 /* delete stale split/leaf node */ 417 if (frag->split_by > 0 || 418 !is_frag_child(frag->frag, prev_frag)) { 419 rb_erase(&frag->node, &ci->i_fragtree); 420 if (frag->split_by > 0) 421 ci->i_fragtree_nsplits--; 422 kfree(frag); 423 } 424 } 425 out_unlock: 426 mutex_unlock(&ci->i_fragtree_mutex); 427 return 0; 428 } 429 430 /* 431 * initialize a newly allocated inode. 
432 */ 433 struct inode *ceph_alloc_inode(struct super_block *sb) 434 { 435 struct ceph_inode_info *ci; 436 int i; 437 438 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); 439 if (!ci) 440 return NULL; 441 442 dout("alloc_inode %p\n", &ci->vfs_inode); 443 444 spin_lock_init(&ci->i_ceph_lock); 445 446 ci->i_version = 0; 447 ci->i_inline_version = 0; 448 ci->i_time_warp_seq = 0; 449 ci->i_ceph_flags = 0; 450 atomic64_set(&ci->i_ordered_count, 1); 451 atomic64_set(&ci->i_release_count, 1); 452 atomic64_set(&ci->i_complete_seq[0], 0); 453 atomic64_set(&ci->i_complete_seq[1], 0); 454 ci->i_symlink = NULL; 455 456 ci->i_max_bytes = 0; 457 ci->i_max_files = 0; 458 459 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 460 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); 461 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); 462 463 ci->i_fragtree = RB_ROOT; 464 mutex_init(&ci->i_fragtree_mutex); 465 466 ci->i_xattrs.blob = NULL; 467 ci->i_xattrs.prealloc_blob = NULL; 468 ci->i_xattrs.dirty = false; 469 ci->i_xattrs.index = RB_ROOT; 470 ci->i_xattrs.count = 0; 471 ci->i_xattrs.names_size = 0; 472 ci->i_xattrs.vals_size = 0; 473 ci->i_xattrs.version = 0; 474 ci->i_xattrs.index_version = 0; 475 476 ci->i_caps = RB_ROOT; 477 ci->i_auth_cap = NULL; 478 ci->i_dirty_caps = 0; 479 ci->i_flushing_caps = 0; 480 INIT_LIST_HEAD(&ci->i_dirty_item); 481 INIT_LIST_HEAD(&ci->i_flushing_item); 482 ci->i_prealloc_cap_flush = NULL; 483 INIT_LIST_HEAD(&ci->i_cap_flush_list); 484 init_waitqueue_head(&ci->i_cap_wq); 485 ci->i_hold_caps_max = 0; 486 INIT_LIST_HEAD(&ci->i_cap_delay_list); 487 INIT_LIST_HEAD(&ci->i_cap_snaps); 488 ci->i_head_snapc = NULL; 489 ci->i_snap_caps = 0; 490 491 ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; 492 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) 493 ci->i_nr_by_mode[i] = 0; 494 495 mutex_init(&ci->i_truncate_mutex); 496 ci->i_truncate_seq = 0; 497 ci->i_truncate_size = 0; 498 ci->i_truncate_pending = 0; 499 500 ci->i_max_size = 0; 501 
ci->i_reported_size = 0; 502 ci->i_wanted_max_size = 0; 503 ci->i_requested_max_size = 0; 504 505 ci->i_pin_ref = 0; 506 ci->i_rd_ref = 0; 507 ci->i_rdcache_ref = 0; 508 ci->i_wr_ref = 0; 509 ci->i_wb_ref = 0; 510 ci->i_fx_ref = 0; 511 ci->i_wrbuffer_ref = 0; 512 ci->i_wrbuffer_ref_head = 0; 513 atomic_set(&ci->i_filelock_ref, 0); 514 atomic_set(&ci->i_shared_gen, 1); 515 ci->i_rdcache_gen = 0; 516 ci->i_rdcache_revoking = 0; 517 518 INIT_LIST_HEAD(&ci->i_unsafe_dirops); 519 INIT_LIST_HEAD(&ci->i_unsafe_iops); 520 spin_lock_init(&ci->i_unsafe_lock); 521 522 ci->i_snap_realm = NULL; 523 INIT_LIST_HEAD(&ci->i_snap_realm_item); 524 INIT_LIST_HEAD(&ci->i_snap_flush_item); 525 526 INIT_WORK(&ci->i_work, ceph_inode_work); 527 ci->i_work_mask = 0; 528 memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); 529 530 ceph_fscache_inode_init(ci); 531 532 ci->i_meta_err = 0; 533 534 return &ci->vfs_inode; 535 } 536 537 void ceph_free_inode(struct inode *inode) 538 { 539 struct ceph_inode_info *ci = ceph_inode(inode); 540 541 kfree(ci->i_symlink); 542 kmem_cache_free(ceph_inode_cachep, ci); 543 } 544 545 void ceph_evict_inode(struct inode *inode) 546 { 547 struct ceph_inode_info *ci = ceph_inode(inode); 548 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 549 struct ceph_inode_frag *frag; 550 struct rb_node *n; 551 552 dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); 553 554 percpu_counter_dec(&mdsc->metric.total_inodes); 555 556 truncate_inode_pages_final(&inode->i_data); 557 clear_inode(inode); 558 559 ceph_fscache_unregister_inode_cookie(ci); 560 561 __ceph_remove_caps(ci); 562 563 if (__ceph_has_any_quota(ci)) 564 ceph_adjust_quota_realms_count(inode, false); 565 566 /* 567 * we may still have a snap_realm reference if there are stray 568 * caps in i_snap_caps. 
569 */ 570 if (ci->i_snap_realm) { 571 if (ceph_snap(inode) == CEPH_NOSNAP) { 572 struct ceph_snap_realm *realm = ci->i_snap_realm; 573 dout(" dropping residual ref to snap realm %p\n", 574 realm); 575 spin_lock(&realm->inodes_with_caps_lock); 576 list_del_init(&ci->i_snap_realm_item); 577 ci->i_snap_realm = NULL; 578 if (realm->ino == ci->i_vino.ino) 579 realm->inode = NULL; 580 spin_unlock(&realm->inodes_with_caps_lock); 581 ceph_put_snap_realm(mdsc, realm); 582 } else { 583 ceph_put_snapid_map(mdsc, ci->i_snapid_map); 584 ci->i_snap_realm = NULL; 585 } 586 } 587 588 while ((n = rb_first(&ci->i_fragtree)) != NULL) { 589 frag = rb_entry(n, struct ceph_inode_frag, node); 590 rb_erase(n, &ci->i_fragtree); 591 kfree(frag); 592 } 593 ci->i_fragtree_nsplits = 0; 594 595 __ceph_destroy_xattrs(ci); 596 if (ci->i_xattrs.blob) 597 ceph_buffer_put(ci->i_xattrs.blob); 598 if (ci->i_xattrs.prealloc_blob) 599 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 600 601 ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); 602 ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); 603 } 604 605 static inline blkcnt_t calc_inode_blocks(u64 size) 606 { 607 return (size + (1<<9) - 1) >> 9; 608 } 609 610 /* 611 * Helpers to fill in size, ctime, mtime, and atime. We have to be 612 * careful because either the client or MDS may have more up to date 613 * info, depending on which capabilities are held, and whether 614 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime 615 * and size are monotonically increasing, except when utimes() or 616 * truncate() increments the corresponding _seq values.) 
617 */ 618 int ceph_fill_file_size(struct inode *inode, int issued, 619 u32 truncate_seq, u64 truncate_size, u64 size) 620 { 621 struct ceph_inode_info *ci = ceph_inode(inode); 622 int queue_trunc = 0; 623 loff_t isize = i_size_read(inode); 624 625 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 626 (truncate_seq == ci->i_truncate_seq && size > isize)) { 627 dout("size %lld -> %llu\n", isize, size); 628 if (size > 0 && S_ISDIR(inode->i_mode)) { 629 pr_err("fill_file_size non-zero size for directory\n"); 630 size = 0; 631 } 632 i_size_write(inode, size); 633 inode->i_blocks = calc_inode_blocks(size); 634 ci->i_reported_size = size; 635 if (truncate_seq != ci->i_truncate_seq) { 636 dout("truncate_seq %u -> %u\n", 637 ci->i_truncate_seq, truncate_seq); 638 ci->i_truncate_seq = truncate_seq; 639 640 /* the MDS should have revoked these caps */ 641 WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | 642 CEPH_CAP_FILE_RD | 643 CEPH_CAP_FILE_WR | 644 CEPH_CAP_FILE_LAZYIO)); 645 /* 646 * If we hold relevant caps, or in the case where we're 647 * not the only client referencing this file and we 648 * don't hold those caps, then we need to check whether 649 * the file is either opened or mmaped 650 */ 651 if ((issued & (CEPH_CAP_FILE_CACHE| 652 CEPH_CAP_FILE_BUFFER)) || 653 mapping_mapped(inode->i_mapping) || 654 __ceph_is_file_opened(ci)) { 655 ci->i_truncate_pending++; 656 queue_trunc = 1; 657 } 658 } 659 } 660 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && 661 ci->i_truncate_size != truncate_size) { 662 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, 663 truncate_size); 664 ci->i_truncate_size = truncate_size; 665 } 666 667 if (queue_trunc) 668 ceph_fscache_invalidate(inode); 669 670 return queue_trunc; 671 } 672 673 void ceph_fill_file_time(struct inode *inode, int issued, 674 u64 time_warp_seq, struct timespec64 *ctime, 675 struct timespec64 *mtime, struct timespec64 *atime) 676 { 677 struct ceph_inode_info *ci = ceph_inode(inode); 678 int warn = 
0; 679 680 if (issued & (CEPH_CAP_FILE_EXCL| 681 CEPH_CAP_FILE_WR| 682 CEPH_CAP_FILE_BUFFER| 683 CEPH_CAP_AUTH_EXCL| 684 CEPH_CAP_XATTR_EXCL)) { 685 if (ci->i_version == 0 || 686 timespec64_compare(ctime, &inode->i_ctime) > 0) { 687 dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", 688 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 689 ctime->tv_sec, ctime->tv_nsec); 690 inode->i_ctime = *ctime; 691 } 692 if (ci->i_version == 0 || 693 ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { 694 /* the MDS did a utimes() */ 695 dout("mtime %lld.%09ld -> %lld.%09ld " 696 "tw %d -> %d\n", 697 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, 698 mtime->tv_sec, mtime->tv_nsec, 699 ci->i_time_warp_seq, (int)time_warp_seq); 700 701 inode->i_mtime = *mtime; 702 inode->i_atime = *atime; 703 ci->i_time_warp_seq = time_warp_seq; 704 } else if (time_warp_seq == ci->i_time_warp_seq) { 705 /* nobody did utimes(); take the max */ 706 if (timespec64_compare(mtime, &inode->i_mtime) > 0) { 707 dout("mtime %lld.%09ld -> %lld.%09ld inc\n", 708 inode->i_mtime.tv_sec, 709 inode->i_mtime.tv_nsec, 710 mtime->tv_sec, mtime->tv_nsec); 711 inode->i_mtime = *mtime; 712 } 713 if (timespec64_compare(atime, &inode->i_atime) > 0) { 714 dout("atime %lld.%09ld -> %lld.%09ld inc\n", 715 inode->i_atime.tv_sec, 716 inode->i_atime.tv_nsec, 717 atime->tv_sec, atime->tv_nsec); 718 inode->i_atime = *atime; 719 } 720 } else if (issued & CEPH_CAP_FILE_EXCL) { 721 /* we did a utimes(); ignore mds values */ 722 } else { 723 warn = 1; 724 } 725 } else { 726 /* we have no write|excl caps; whatever the MDS says is true */ 727 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 728 inode->i_ctime = *ctime; 729 inode->i_mtime = *mtime; 730 inode->i_atime = *atime; 731 ci->i_time_warp_seq = time_warp_seq; 732 } else { 733 warn = 1; 734 } 735 } 736 if (warn) /* time_warp_seq shouldn't go backwards */ 737 dout("%p mds time_warp_seq %llu < %u\n", 738 inode, time_warp_seq, ci->i_time_warp_seq); 739 } 740 741 /* 
* Populate an inode based on info from mds.  May be called on new or
 * existing inodes.
 *
 * @locked_page is only handed on to ceph_fill_inline_data().
 * @dirinfo may be NULL; when set it is fed to the frag tree and dirfrag
 * helpers after the inode itself has been updated.
 * @cap_fmode, when >= 0, is passed to __ceph_touch_fmode().
 * @caps_reservation is where the preallocated cap is drawn from.
 *
 * Takes i_ceph_lock itself.  The lock is dropped and re-taken while a
 * symlink target is being copied.
 *
 * Returns 0 on success, or -ENOMEM if the cap could not be
 * preallocated or the symlink target could not be duplicated.
 */
int ceph_fill_inode(struct inode *inode, struct page *locked_page,
		    struct ceph_mds_reply_info_in *iinfo,
		    struct ceph_mds_reply_dirfrag *dirinfo,
		    struct ceph_mds_session *session, int cap_fmode,
		    struct ceph_cap_reservation *caps_reservation)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_mds_reply_inode *info = iinfo->in;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int issued, new_issued, info_caps;
	struct timespec64 mtime, atime, ctime;
	struct ceph_buffer *xattr_blob = NULL;
	struct ceph_buffer *old_blob = NULL;
	struct ceph_string *pool_ns = NULL;
	struct ceph_cap *new_cap = NULL;
	int err = 0;
	bool wake = false;
	bool queue_trunc = false;
	bool new_version = false;
	bool fill_inline = false;

	dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
	     ci->i_version);

	info_caps = le32_to_cpu(info->cap.caps);

	/* prealloc new cap struct */
	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
		new_cap = ceph_get_cap(mdsc, caps_reservation);
		if (!new_cap)
			return -ENOMEM;
	}

	/*
	 * prealloc xattr data, if it looks like we'll need it.  only
	 * if len > 4 (meaning there are actually xattrs; the first 4
	 * bytes are the xattr count).
	 */
	if (iinfo->xattr_len > 4) {
		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
		/* allocation failure is only logged; we carry on without */
		if (!xattr_blob)
			pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
			       iinfo->xattr_len);
	}

	if (iinfo->pool_ns_len > 0)
		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
						     iinfo->pool_ns_len);

	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));

	spin_lock(&ci->i_ceph_lock);

	/*
	 * provided version will be odd if inode value is projected,
	 * even if stable.  skip the update if we have newer stable
	 * info (ours>=theirs, e.g. due to racing mds replies), unless
	 * we are getting projected (unstable) info (in which case the
	 * version is odd, and we want ours>theirs).
	 *   us   them
	 *   2    2     skip
	 *   3    2     skip
	 *   3    3     update
	 */
	if (ci->i_version == 0 ||
	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
		new_version = true;

	/* Update change_attribute */
	inode_set_max_iversion_raw(inode, iinfo->change_attr);

	/* caps we hold (issued or dirty) vs. caps this reply newly grants */
	__ceph_caps_issued(ci, &issued);
	issued |= __ceph_caps_dirty(ci);
	new_issued = ~issued & info_caps;

	/* update inode */
	inode->i_rdev = le32_to_cpu(info->rdev);
	/* directories have fl_stripe_unit set to zero */
	if (le32_to_cpu(info->layout.fl_stripe_unit))
		inode->i_blkbits =
			fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
	else
		inode->i_blkbits = CEPH_BLOCK_SHIFT;

	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);

	/* mode/owner/btime: skipped while we hold AUTH_EXCL ourselves */
	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(info->mode);
		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     from_kuid(&init_user_ns, inode->i_uid),
		     from_kgid(&init_user_ns, inode->i_gid));
		ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
		ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
	}

	/* link count: skipped while we hold LINK_EXCL ourselves */
	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
	    (issued & CEPH_CAP_LINK_EXCL) == 0)
		set_nlink(inode, le32_to_cpu(info->nlink));

	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
		/* be careful with mtime, atime, size */
		ceph_decode_timespec64(&atime, &info->atime);
		ceph_decode_timespec64(&mtime, &info->mtime);
		ceph_decode_timespec64(&ctime, &info->ctime);
		ceph_fill_file_time(inode, issued,
				le32_to_cpu(info->time_warp_seq),
				&ctime, &mtime, &atime);
	}

	if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
		ci->i_files = le64_to_cpu(info->files);
		ci->i_subdirs = le64_to_cpu(info->subdirs);
	}

	if (new_version ||
	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
		s64 old_pool = ci->i_layout.pool_id;
		struct ceph_string *old_ns;

		ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
					lockdep_is_held(&ci->i_ceph_lock));
		rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);

		if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;

		/* the replaced namespace string is released at "out" */
		pool_ns = old_ns;

		queue_trunc = ceph_fill_file_size(inode, issued,
					le32_to_cpu(info->truncate_seq),
					le64_to_cpu(info->truncate_size),
					le64_to_cpu(info->size));
		/* only update max_size on auth cap */
		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
		    ci->i_max_size != le64_to_cpu(info->max_size)) {
			dout("max_size %lld -> %llu\n", ci->i_max_size,
					le64_to_cpu(info->max_size));
			ci->i_max_size = le64_to_cpu(info->max_size);
		}
	}

	/* layout and rstat are not tracked by capability, update them if
	 * the inode info is from auth mds */
	if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
		if (S_ISDIR(inode->i_mode)) {
			ci->i_dir_layout = iinfo->dir_layout;
			ci->i_rbytes = le64_to_cpu(info->rbytes);
			ci->i_rfiles = le64_to_cpu(info->rfiles);
			ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
			ci->i_dir_pin = iinfo->dir_pin;
			ci->i_rsnaps = iinfo->rsnaps;
			ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
		}
	}

	/* xattrs */
	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
		/* the blob being replaced is released at "out" */
		if (ci->i_xattrs.blob)
			old_blob = ci->i_xattrs.blob;
		ci->i_xattrs.blob = xattr_blob;
		if (xattr_blob)
			memcpy(ci->i_xattrs.blob->vec.iov_base,
			       iinfo->xattr_data, iinfo->xattr_len);
		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
		ceph_forget_all_cached_acls(inode);
		ceph_security_invalidate_secctx(inode);
		/* the inode owns the blob now; keep "out" from dropping it */
		xattr_blob = NULL;
	}

	/* finally update i_version */
	if (le64_to_cpu(info->version) > ci->i_version)
		ci->i_version = le64_to_cpu(info->version);

	inode->i_mapping->a_ops = &ceph_aops;

	/* install the operation tables that match the inode type */
	switch (inode->i_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFSOCK:
		inode->i_blkbits = PAGE_SHIFT;
		init_special_inode(inode, inode->i_mode, inode->i_rdev);
		inode->i_op = &ceph_file_iops;
		break;
	case S_IFREG:
		inode->i_op = &ceph_file_iops;
		inode->i_fop = &ceph_file_fops;
		break;
	case S_IFLNK:
		inode->i_op = &ceph_symlink_iops;
		if (!ci->i_symlink) {
			u32 symlen = iinfo->symlink_len;
			char *sym;

			/* i_ceph_lock is not held across the copy below */
			spin_unlock(&ci->i_ceph_lock);

			if (symlen != i_size_read(inode)) {
				pr_err("%s %llx.%llx BAD symlink "
					"size %lld\n", __func__,
					ceph_vinop(inode),
					i_size_read(inode));
				i_size_write(inode, symlen);
				inode->i_blocks = calc_inode_blocks(symlen);
			}

			err = -ENOMEM;
			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
			if (!sym)
				goto out;

			spin_lock(&ci->i_ceph_lock);
			if (!ci->i_symlink)
				ci->i_symlink = sym;
			else
				kfree(sym); /* lost a race */
		}
		inode->i_link = ci->i_symlink;
		break;
	case S_IFDIR:
		inode->i_op = &ceph_dir_iops;
		inode->i_fop = &ceph_dir_fops;
		break;
	default:
		/* unknown type: logged only, processing continues */
		pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
		       ceph_vinop(inode), inode->i_mode);
	}

	/* were we issued a capability? */
	if (info_caps) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			ceph_add_cap(inode, session,
				     le64_to_cpu(info->cap.cap_id),
				     info_caps,
				     le32_to_cpu(info->cap.wanted),
				     le32_to_cpu(info->cap.seq),
				     le32_to_cpu(info->cap.mseq),
				     le64_to_cpu(info->cap.realm),
				     info->cap.flags, &new_cap);

			/* set dir completion flag? */
			if (S_ISDIR(inode->i_mode) &&
			    ci->i_files == 0 && ci->i_subdirs == 0 &&
			    (info_caps & CEPH_CAP_FILE_SHARED) &&
			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
			    !__ceph_dir_is_complete(ci)) {
				dout(" marking %p complete (empty)\n", inode);
				i_size_write(inode, 0);
				__ceph_dir_set_complete(ci,
					atomic64_read(&ci->i_release_count),
					atomic64_read(&ci->i_ordered_count));
			}

			wake = true;
		} else {
			dout(" %p got snap_caps %s\n", inode,
			     ceph_cap_string(info_caps));
			ci->i_snap_caps |= info_caps;
		}
	}

	if (iinfo->inline_version > 0 &&
	    iinfo->inline_version >= ci->i_inline_version) {
		int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
		ci->i_inline_version = iinfo->inline_version;
		if (ci->i_inline_version != CEPH_INLINE_NONE &&
		    (locked_page || (info_caps & cache_caps)))
			fill_inline = true;
	}

	if (cap_fmode >= 0) {
		if (!info_caps)
			pr_warn("mds issued no caps on %llx.%llx\n",
				ceph_vinop(inode));
		__ceph_touch_fmode(ci, mdsc, cap_fmode);
	}

	spin_unlock(&ci->i_ceph_lock);

	/* everything below runs after i_ceph_lock has been dropped */
	if (fill_inline)
		ceph_fill_inline_data(inode, locked_page,
				      iinfo->inline_data, iinfo->inline_len);

	if (wake)
		wake_up_all(&ci->i_cap_wq);

	/* queue truncate if we saw i_size decrease */
	if (queue_trunc)
		ceph_queue_vmtruncate(inode);

	/* populate frag tree */
	if (S_ISDIR(inode->i_mode))
		ceph_fill_fragtree(inode, &info->fragtree, dirinfo);

	/* update delegation info? */
	if (dirinfo)
		ceph_fill_dirfrag(inode, dirinfo);

	err = 0;
out:
	/* release whatever was preallocated or displaced above */
	if (new_cap)
		ceph_put_cap(mdsc, new_cap);
	ceph_buffer_put(old_blob);
	ceph_buffer_put(xattr_blob);
	ceph_put_string(pool_ns);
	return err;
}

/*
 * Record the dentry lease described by @lease, granted by @session,
 * on @dentry.
 *
 * If the dentry held a lease from a different session, that session is
 * handed back through @old_lease_session and the caller must drop the
 * reference once d_lock has been released.
 *
 * caller should hold session s_mutex and dentry->d_lock.
 */
static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
				  struct ceph_mds_reply_lease *lease,
				  struct ceph_mds_session *session,
				  unsigned long from_time,
				  struct ceph_mds_session **old_lease_session)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	unsigned mask = le16_to_cpu(lease->mask);
	long unsigned duration = le32_to_cpu(lease->duration_ms);
	/* expiry, and the half-way point after which we renew, in jiffies */
	long unsigned ttl = from_time + (duration * HZ) / 1000;
	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;

	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
	     dentry, duration, ttl);

	/* only track leases on regular dentries */
	if (ceph_snap(dir) != CEPH_NOSNAP)
		return;

	if (mask & CEPH_LEASE_PRIMARY_LINK)
		di->flags |= CEPH_DENTRY_PRIMARY_LINK;
	else
		di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;

	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
	if (!(mask & CEPH_LEASE_VALID)) {
		__ceph_dentry_dir_lease_touch(di);
		return;
	}

	if (di->lease_gen == session->s_cap_gen &&
	    time_before(ttl, di->time))
		return;  /* we already have a newer lease. */

	if (di->lease_session && di->lease_session != session) {
		*old_lease_session = di->lease_session;
		di->lease_session = NULL;
	}

	if (!di->lease_session)
		di->lease_session = ceph_get_mds_session(session);
	di->lease_gen = session->s_cap_gen;
	di->lease_seq = le32_to_cpu(lease->seq);
	di->lease_renew_after = half_ttl;
	di->lease_renew_from = 0;
	di->time = ttl;

	__ceph_dentry_lease_touch(di);
}

/*
 * Locked wrapper around __update_dentry_lease(): takes d_lock for the
 * update and drops any displaced session reference after unlocking.
 */
static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
				       struct ceph_mds_reply_lease *lease,
				       struct ceph_mds_session *session,
				       unsigned long from_time)
{
	struct ceph_mds_session *old_lease_session = NULL;
	spin_lock(&dentry->d_lock);
	__update_dentry_lease(dir, dentry, lease, session, from_time,
			      &old_lease_session);
	spin_unlock(&dentry->d_lock);
	if (old_lease_session)
		ceph_put_mds_session(old_lease_session);
}

/*
 * update dentry lease without having parent inode locked
 *
 * The lease is applied only if, under d_lock, the dentry still has the
 * name @dname, its parent matches @pdvino, and its inode matches
 * @ptvino (or it is negative when @ptvino is NULL).
 */
static void update_dentry_lease_careful(struct dentry *dentry,
					struct ceph_mds_reply_lease *lease,
					struct ceph_mds_session *session,
					unsigned long from_time,
					char *dname, u32 dname_len,
					struct ceph_vino *pdvino,
					struct ceph_vino *ptvino)

{
	struct inode *dir;
	struct ceph_mds_session *old_lease_session = NULL;

	spin_lock(&dentry->d_lock);
	/* make sure dentry's name matches target */
	if (dentry->d_name.len != dname_len ||
	    memcmp(dentry->d_name.name, dname, dname_len))
		goto out_unlock;

	dir = d_inode(dentry->d_parent);
	/* make sure parent matches dvino */
	if (!ceph_ino_compare(dir, pdvino))
		goto out_unlock;

	/* make sure dentry's inode matches target. NULL ptvino means that
	 * we expect a negative dentry */
	if (ptvino) {
		if (d_really_is_negative(dentry))
			goto out_unlock;
		if (!ceph_ino_compare(d_inode(dentry), ptvino))
			goto out_unlock;
	} else {
		if (d_really_is_positive(dentry))
			goto out_unlock;
	}

	__update_dentry_lease(dir, dentry, lease, session,
			      from_time, &old_lease_session);
out_unlock:
	spin_unlock(&dentry->d_lock);
	if (old_lease_session)
		ceph_put_mds_session(old_lease_session);
}

/*
 * splice a dentry to an inode.
 * caller must hold directory i_mutex for this to be safe.
 */
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
	struct dentry *dn = *pdn;
	struct dentry *realdn;

	BUG_ON(d_inode(dn));

	if (S_ISDIR(in->i_mode)) {
		/* If inode is directory, d_splice_alias() below will remove
		 * 'realdn' from its origin parent. We need to ensure that
		 * origin parent's readdir cache will not reference 'realdn'
		 */
		realdn = d_find_any_alias(in);
		if (realdn) {
			struct ceph_dentry_info *di = ceph_dentry(realdn);
			spin_lock(&realdn->d_lock);

			realdn->d_op->d_prune(realdn);

			di->time = jiffies;
			di->lease_shared_gen = 0;
			di->offset = 0;

			spin_unlock(&realdn->d_lock);
			dput(realdn);
		}
	}

	/* dn must be unhashed */
	if (!d_unhashed(dn))
		d_drop(dn);
	realdn = d_splice_alias(in, dn);
	if (IS_ERR(realdn)) {
		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
		return PTR_ERR(realdn);
	}

	if (realdn) {
		dout("dn %p (%d) spliced with %p (%d) "
		     "inode %p ino %llx.%llx\n",
		     dn, d_count(dn),
		     realdn, d_count(realdn),
		     d_inode(realdn), ceph_vinop(d_inode(realdn)));
		dput(dn);
		*pdn = realdn;
	} else {
		BUG_ON(!ceph_dentry(dn));
		dout("dn %p attached 
to %p ino %llx.%llx\n", 1227 dn, d_inode(dn), ceph_vinop(d_inode(dn))); 1228 } 1229 return 0; 1230 } 1231 1232 /* 1233 * Incorporate results into the local cache. This is either just 1234 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 1235 * after a lookup). 1236 * 1237 * A reply may contain 1238 * a directory inode along with a dentry. 1239 * and/or a target inode 1240 * 1241 * Called with snap_rwsem (read). 1242 */ 1243 int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) 1244 { 1245 struct ceph_mds_session *session = req->r_session; 1246 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1247 struct inode *in = NULL; 1248 struct ceph_vino tvino, dvino; 1249 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 1250 int err = 0; 1251 1252 dout("fill_trace %p is_dentry %d is_target %d\n", req, 1253 rinfo->head->is_dentry, rinfo->head->is_target); 1254 1255 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 1256 dout("fill_trace reply is empty!\n"); 1257 if (rinfo->head->result == 0 && req->r_parent) 1258 ceph_invalidate_dir_request(req); 1259 return 0; 1260 } 1261 1262 if (rinfo->head->is_dentry) { 1263 struct inode *dir = req->r_parent; 1264 1265 if (dir) { 1266 err = ceph_fill_inode(dir, NULL, &rinfo->diri, 1267 rinfo->dirfrag, session, -1, 1268 &req->r_caps_reservation); 1269 if (err < 0) 1270 goto done; 1271 } else { 1272 WARN_ON_ONCE(1); 1273 } 1274 1275 if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && 1276 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1277 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { 1278 struct qstr dname; 1279 struct dentry *dn, *parent; 1280 1281 BUG_ON(!rinfo->head->is_target); 1282 BUG_ON(req->r_dentry); 1283 1284 parent = d_find_any_alias(dir); 1285 BUG_ON(!parent); 1286 1287 dname.name = rinfo->dname; 1288 dname.len = rinfo->dname_len; 1289 dname.hash = full_name_hash(parent, dname.name, dname.len); 1290 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); 
1291 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1292 retry_lookup: 1293 dn = d_lookup(parent, &dname); 1294 dout("d_lookup on parent=%p name=%.*s got %p\n", 1295 parent, dname.len, dname.name, dn); 1296 1297 if (!dn) { 1298 dn = d_alloc(parent, &dname); 1299 dout("d_alloc %p '%.*s' = %p\n", parent, 1300 dname.len, dname.name, dn); 1301 if (!dn) { 1302 dput(parent); 1303 err = -ENOMEM; 1304 goto done; 1305 } 1306 err = 0; 1307 } else if (d_really_is_positive(dn) && 1308 (ceph_ino(d_inode(dn)) != tvino.ino || 1309 ceph_snap(d_inode(dn)) != tvino.snap)) { 1310 dout(" dn %p points to wrong inode %p\n", 1311 dn, d_inode(dn)); 1312 ceph_dir_clear_ordered(dir); 1313 d_delete(dn); 1314 dput(dn); 1315 goto retry_lookup; 1316 } 1317 1318 req->r_dentry = dn; 1319 dput(parent); 1320 } 1321 } 1322 1323 if (rinfo->head->is_target) { 1324 /* Should be filled in by handle_reply */ 1325 BUG_ON(!req->r_target_inode); 1326 1327 in = req->r_target_inode; 1328 err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, 1329 NULL, session, 1330 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && 1331 !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && 1332 rinfo->head->result == 0) ? 
req->r_fmode : -1, 1333 &req->r_caps_reservation); 1334 if (err < 0) { 1335 pr_err("ceph_fill_inode badness %p %llx.%llx\n", 1336 in, ceph_vinop(in)); 1337 req->r_target_inode = NULL; 1338 if (in->i_state & I_NEW) 1339 discard_new_inode(in); 1340 else 1341 iput(in); 1342 goto done; 1343 } 1344 if (in->i_state & I_NEW) 1345 unlock_new_inode(in); 1346 } 1347 1348 /* 1349 * ignore null lease/binding on snapdir ENOENT, or else we 1350 * will have trouble splicing in the virtual snapdir later 1351 */ 1352 if (rinfo->head->is_dentry && 1353 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && 1354 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1355 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 1356 fsc->mount_options->snapdir_name, 1357 req->r_dentry->d_name.len))) { 1358 /* 1359 * lookup link rename : null -> possibly existing inode 1360 * mknod symlink mkdir : null -> new inode 1361 * unlink : linked -> null 1362 */ 1363 struct inode *dir = req->r_parent; 1364 struct dentry *dn = req->r_dentry; 1365 bool have_dir_cap, have_lease; 1366 1367 BUG_ON(!dn); 1368 BUG_ON(!dir); 1369 BUG_ON(d_inode(dn->d_parent) != dir); 1370 1371 dvino.ino = le64_to_cpu(rinfo->diri.in->ino); 1372 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); 1373 1374 BUG_ON(ceph_ino(dir) != dvino.ino); 1375 BUG_ON(ceph_snap(dir) != dvino.snap); 1376 1377 /* do we have a lease on the whole dir? */ 1378 have_dir_cap = 1379 (le32_to_cpu(rinfo->diri.in->cap.caps) & 1380 CEPH_CAP_FILE_SHARED); 1381 1382 /* do we have a dn lease? */ 1383 have_lease = have_dir_cap || 1384 le32_to_cpu(rinfo->dlease->duration_ms); 1385 if (!have_lease) 1386 dout("fill_trace no dentry lease or dir cap\n"); 1387 1388 /* rename? 
*/ 1389 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { 1390 struct inode *olddir = req->r_old_dentry_dir; 1391 BUG_ON(!olddir); 1392 1393 dout(" src %p '%pd' dst %p '%pd'\n", 1394 req->r_old_dentry, 1395 req->r_old_dentry, 1396 dn, dn); 1397 dout("fill_trace doing d_move %p -> %p\n", 1398 req->r_old_dentry, dn); 1399 1400 /* d_move screws up sibling dentries' offsets */ 1401 ceph_dir_clear_ordered(dir); 1402 ceph_dir_clear_ordered(olddir); 1403 1404 d_move(req->r_old_dentry, dn); 1405 dout(" src %p '%pd' dst %p '%pd'\n", 1406 req->r_old_dentry, 1407 req->r_old_dentry, 1408 dn, dn); 1409 1410 /* ensure target dentry is invalidated, despite 1411 rehashing bug in vfs_rename_dir */ 1412 ceph_invalidate_dentry_lease(dn); 1413 1414 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1415 ceph_dentry(req->r_old_dentry)->offset); 1416 1417 /* swap r_dentry and r_old_dentry in case that 1418 * splice_dentry() gets called later. This is safe 1419 * because no other place will use them */ 1420 req->r_dentry = req->r_old_dentry; 1421 req->r_old_dentry = dn; 1422 dn = req->r_dentry; 1423 } 1424 1425 /* null dentry? 
*/ 1426 if (!rinfo->head->is_target) { 1427 dout("fill_trace null dentry\n"); 1428 if (d_really_is_positive(dn)) { 1429 dout("d_delete %p\n", dn); 1430 ceph_dir_clear_ordered(dir); 1431 d_delete(dn); 1432 } else if (have_lease) { 1433 if (d_unhashed(dn)) 1434 d_add(dn, NULL); 1435 update_dentry_lease(dir, dn, 1436 rinfo->dlease, session, 1437 req->r_request_started); 1438 } 1439 goto done; 1440 } 1441 1442 /* attach proper inode */ 1443 if (d_really_is_negative(dn)) { 1444 ceph_dir_clear_ordered(dir); 1445 ihold(in); 1446 err = splice_dentry(&req->r_dentry, in); 1447 if (err < 0) 1448 goto done; 1449 dn = req->r_dentry; /* may have spliced */ 1450 } else if (d_really_is_positive(dn) && d_inode(dn) != in) { 1451 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1452 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1453 ceph_vinop(in)); 1454 d_invalidate(dn); 1455 have_lease = false; 1456 } 1457 1458 if (have_lease) { 1459 update_dentry_lease(dir, dn, 1460 rinfo->dlease, session, 1461 req->r_request_started); 1462 } 1463 dout(" final dn %p\n", dn); 1464 } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || 1465 req->r_op == CEPH_MDS_OP_MKSNAP) && 1466 test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && 1467 !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { 1468 struct inode *dir = req->r_parent; 1469 1470 /* fill out a snapdir LOOKUPSNAP dentry */ 1471 BUG_ON(!dir); 1472 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); 1473 BUG_ON(!req->r_dentry); 1474 dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry); 1475 ceph_dir_clear_ordered(dir); 1476 ihold(in); 1477 err = splice_dentry(&req->r_dentry, in); 1478 if (err < 0) 1479 goto done; 1480 } else if (rinfo->head->is_dentry && req->r_dentry) { 1481 /* parent inode is not locked, be carefull */ 1482 struct ceph_vino *ptvino = NULL; 1483 dvino.ino = le64_to_cpu(rinfo->diri.in->ino); 1484 dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); 1485 if (rinfo->head->is_target) { 1486 tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); 
1487 tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); 1488 ptvino = &tvino; 1489 } 1490 update_dentry_lease_careful(req->r_dentry, rinfo->dlease, 1491 session, req->r_request_started, 1492 rinfo->dname, rinfo->dname_len, 1493 &dvino, ptvino); 1494 } 1495 done: 1496 dout("fill_trace done err=%d\n", err); 1497 return err; 1498 } 1499 1500 /* 1501 * Prepopulate our cache with readdir results, leases, etc. 1502 */ 1503 static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, 1504 struct ceph_mds_session *session) 1505 { 1506 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1507 int i, err = 0; 1508 1509 for (i = 0; i < rinfo->dir_nr; i++) { 1510 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; 1511 struct ceph_vino vino; 1512 struct inode *in; 1513 int rc; 1514 1515 vino.ino = le64_to_cpu(rde->inode.in->ino); 1516 vino.snap = le64_to_cpu(rde->inode.in->snapid); 1517 1518 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1519 if (IS_ERR(in)) { 1520 err = PTR_ERR(in); 1521 dout("new_inode badness got %d\n", err); 1522 continue; 1523 } 1524 rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, 1525 -1, &req->r_caps_reservation); 1526 if (rc < 0) { 1527 pr_err("ceph_fill_inode badness on %p got %d\n", 1528 in, rc); 1529 err = rc; 1530 if (in->i_state & I_NEW) { 1531 ihold(in); 1532 discard_new_inode(in); 1533 } 1534 } else if (in->i_state & I_NEW) { 1535 unlock_new_inode(in); 1536 } 1537 1538 /* avoid calling iput_final() in mds dispatch threads */ 1539 ceph_async_iput(in); 1540 } 1541 1542 return err; 1543 } 1544 1545 void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) 1546 { 1547 if (ctl->page) { 1548 kunmap(ctl->page); 1549 put_page(ctl->page); 1550 ctl->page = NULL; 1551 } 1552 } 1553 1554 static int fill_readdir_cache(struct inode *dir, struct dentry *dn, 1555 struct ceph_readdir_cache_control *ctl, 1556 struct ceph_mds_request *req) 1557 { 1558 struct ceph_inode_info *ci = ceph_inode(dir); 1559 
unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); 1560 unsigned idx = ctl->index % nsize; 1561 pgoff_t pgoff = ctl->index / nsize; 1562 1563 if (!ctl->page || pgoff != page_index(ctl->page)) { 1564 ceph_readdir_cache_release(ctl); 1565 if (idx == 0) 1566 ctl->page = grab_cache_page(&dir->i_data, pgoff); 1567 else 1568 ctl->page = find_lock_page(&dir->i_data, pgoff); 1569 if (!ctl->page) { 1570 ctl->index = -1; 1571 return idx == 0 ? -ENOMEM : 0; 1572 } 1573 /* reading/filling the cache are serialized by 1574 * i_mutex, no need to use page lock */ 1575 unlock_page(ctl->page); 1576 ctl->dentries = kmap(ctl->page); 1577 if (idx == 0) 1578 memset(ctl->dentries, 0, PAGE_SIZE); 1579 } 1580 1581 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && 1582 req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { 1583 dout("readdir cache dn %p idx %d\n", dn, ctl->index); 1584 ctl->dentries[idx] = dn; 1585 ctl->index++; 1586 } else { 1587 dout("disable readdir cache\n"); 1588 ctl->index = -1; 1589 } 1590 return 0; 1591 } 1592 1593 int ceph_readdir_prepopulate(struct ceph_mds_request *req, 1594 struct ceph_mds_session *session) 1595 { 1596 struct dentry *parent = req->r_dentry; 1597 struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); 1598 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1599 struct qstr dname; 1600 struct dentry *dn; 1601 struct inode *in; 1602 int err = 0, skipped = 0, ret, i; 1603 u32 frag = le32_to_cpu(req->r_args.readdir.frag); 1604 u32 last_hash = 0; 1605 u32 fpos_offset; 1606 struct ceph_readdir_cache_control cache_ctl = {}; 1607 1608 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) 1609 return readdir_prepopulate_inodes_only(req, session); 1610 1611 if (rinfo->hash_order) { 1612 if (req->r_path2) { 1613 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, 1614 req->r_path2, 1615 strlen(req->r_path2)); 1616 last_hash = ceph_frag_value(last_hash); 1617 } else if (rinfo->offset_hash) { 1618 /* mds 
understands offset_hash */ 1619 WARN_ON_ONCE(req->r_readdir_offset != 2); 1620 last_hash = le32_to_cpu(req->r_args.readdir.offset_hash); 1621 } 1622 } 1623 1624 if (rinfo->dir_dir && 1625 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1626 dout("readdir_prepopulate got new frag %x -> %x\n", 1627 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1628 frag = le32_to_cpu(rinfo->dir_dir->frag); 1629 if (!rinfo->hash_order) 1630 req->r_readdir_offset = 2; 1631 } 1632 1633 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1634 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", 1635 rinfo->dir_nr, parent); 1636 } else { 1637 dout("readdir_prepopulate %d items under dn %p\n", 1638 rinfo->dir_nr, parent); 1639 if (rinfo->dir_dir) 1640 ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); 1641 1642 if (ceph_frag_is_leftmost(frag) && 1643 req->r_readdir_offset == 2 && 1644 !(rinfo->hash_order && last_hash)) { 1645 /* note dir version at start of readdir so we can 1646 * tell if any dentries get dropped */ 1647 req->r_dir_release_cnt = 1648 atomic64_read(&ci->i_release_count); 1649 req->r_dir_ordered_cnt = 1650 atomic64_read(&ci->i_ordered_count); 1651 req->r_readdir_cache_idx = 0; 1652 } 1653 } 1654 1655 cache_ctl.index = req->r_readdir_cache_idx; 1656 fpos_offset = req->r_readdir_offset; 1657 1658 /* FIXME: release caps/leases if error occurs */ 1659 for (i = 0; i < rinfo->dir_nr; i++) { 1660 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; 1661 struct ceph_vino tvino; 1662 1663 dname.name = rde->name; 1664 dname.len = rde->name_len; 1665 dname.hash = full_name_hash(parent, dname.name, dname.len); 1666 1667 tvino.ino = le64_to_cpu(rde->inode.in->ino); 1668 tvino.snap = le64_to_cpu(rde->inode.in->snapid); 1669 1670 if (rinfo->hash_order) { 1671 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, 1672 rde->name, rde->name_len); 1673 hash = ceph_frag_value(hash); 1674 if (hash != last_hash) 1675 fpos_offset = 2; 1676 last_hash = hash; 1677 rde->offset = 
ceph_make_fpos(hash, fpos_offset++, true); 1678 } else { 1679 rde->offset = ceph_make_fpos(frag, fpos_offset++, false); 1680 } 1681 1682 retry_lookup: 1683 dn = d_lookup(parent, &dname); 1684 dout("d_lookup on parent=%p name=%.*s got %p\n", 1685 parent, dname.len, dname.name, dn); 1686 1687 if (!dn) { 1688 dn = d_alloc(parent, &dname); 1689 dout("d_alloc %p '%.*s' = %p\n", parent, 1690 dname.len, dname.name, dn); 1691 if (!dn) { 1692 dout("d_alloc badness\n"); 1693 err = -ENOMEM; 1694 goto out; 1695 } 1696 } else if (d_really_is_positive(dn) && 1697 (ceph_ino(d_inode(dn)) != tvino.ino || 1698 ceph_snap(d_inode(dn)) != tvino.snap)) { 1699 struct ceph_dentry_info *di = ceph_dentry(dn); 1700 dout(" dn %p points to wrong inode %p\n", 1701 dn, d_inode(dn)); 1702 1703 spin_lock(&dn->d_lock); 1704 if (di->offset > 0 && 1705 di->lease_shared_gen == 1706 atomic_read(&ci->i_shared_gen)) { 1707 __ceph_dir_clear_ordered(ci); 1708 di->offset = 0; 1709 } 1710 spin_unlock(&dn->d_lock); 1711 1712 d_delete(dn); 1713 dput(dn); 1714 goto retry_lookup; 1715 } 1716 1717 /* inode */ 1718 if (d_really_is_positive(dn)) { 1719 in = d_inode(dn); 1720 } else { 1721 in = ceph_get_inode(parent->d_sb, tvino); 1722 if (IS_ERR(in)) { 1723 dout("new_inode badness\n"); 1724 d_drop(dn); 1725 dput(dn); 1726 err = PTR_ERR(in); 1727 goto out; 1728 } 1729 } 1730 1731 ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, 1732 -1, &req->r_caps_reservation); 1733 if (ret < 0) { 1734 pr_err("ceph_fill_inode badness on %p\n", in); 1735 if (d_really_is_negative(dn)) { 1736 /* avoid calling iput_final() in mds 1737 * dispatch threads */ 1738 if (in->i_state & I_NEW) { 1739 ihold(in); 1740 discard_new_inode(in); 1741 } 1742 ceph_async_iput(in); 1743 } 1744 d_drop(dn); 1745 err = ret; 1746 goto next_item; 1747 } 1748 if (in->i_state & I_NEW) 1749 unlock_new_inode(in); 1750 1751 if (d_really_is_negative(dn)) { 1752 if (ceph_security_xattr_deadlock(in)) { 1753 dout(" skip splicing dn %p to inode %p" 1754 " 
(security xattr deadlock)\n", dn, in); 1755 ceph_async_iput(in); 1756 skipped++; 1757 goto next_item; 1758 } 1759 1760 err = splice_dentry(&dn, in); 1761 if (err < 0) 1762 goto next_item; 1763 } 1764 1765 ceph_dentry(dn)->offset = rde->offset; 1766 1767 update_dentry_lease(d_inode(parent), dn, 1768 rde->lease, req->r_session, 1769 req->r_request_started); 1770 1771 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1772 ret = fill_readdir_cache(d_inode(parent), dn, 1773 &cache_ctl, req); 1774 if (ret < 0) 1775 err = ret; 1776 } 1777 next_item: 1778 dput(dn); 1779 } 1780 out: 1781 if (err == 0 && skipped == 0) { 1782 set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); 1783 req->r_readdir_cache_idx = cache_ctl.index; 1784 } 1785 ceph_readdir_cache_release(&cache_ctl); 1786 dout("readdir_prepopulate done\n"); 1787 return err; 1788 } 1789 1790 bool ceph_inode_set_size(struct inode *inode, loff_t size) 1791 { 1792 struct ceph_inode_info *ci = ceph_inode(inode); 1793 bool ret; 1794 1795 spin_lock(&ci->i_ceph_lock); 1796 dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); 1797 i_size_write(inode, size); 1798 inode->i_blocks = calc_inode_blocks(size); 1799 1800 ret = __ceph_should_report_size(ci); 1801 1802 spin_unlock(&ci->i_ceph_lock); 1803 return ret; 1804 } 1805 1806 /* 1807 * Put reference to inode, but avoid calling iput_final() in current thread. 1808 * iput_final() may wait for reahahead pages. The wait can cause deadlock in 1809 * some contexts. 
1810 */ 1811 void ceph_async_iput(struct inode *inode) 1812 { 1813 if (!inode) 1814 return; 1815 for (;;) { 1816 if (atomic_add_unless(&inode->i_count, -1, 1)) 1817 break; 1818 if (queue_work(ceph_inode_to_client(inode)->inode_wq, 1819 &ceph_inode(inode)->i_work)) 1820 break; 1821 /* queue work failed, i_count must be at least 2 */ 1822 } 1823 } 1824 1825 void ceph_queue_inode_work(struct inode *inode, int work_bit) 1826 { 1827 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1828 struct ceph_inode_info *ci = ceph_inode(inode); 1829 set_bit(work_bit, &ci->i_work_mask); 1830 1831 ihold(inode); 1832 if (queue_work(fsc->inode_wq, &ci->i_work)) { 1833 dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask); 1834 } else { 1835 dout("queue_inode_work %p already queued, mask=%lx\n", 1836 inode, ci->i_work_mask); 1837 iput(inode); 1838 } 1839 } 1840 1841 static void ceph_do_invalidate_pages(struct inode *inode) 1842 { 1843 struct ceph_inode_info *ci = ceph_inode(inode); 1844 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1845 u32 orig_gen; 1846 int check = 0; 1847 1848 mutex_lock(&ci->i_truncate_mutex); 1849 1850 if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { 1851 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", 1852 inode, ceph_ino(inode)); 1853 mapping_set_error(inode->i_mapping, -EIO); 1854 truncate_pagecache(inode, 0); 1855 mutex_unlock(&ci->i_truncate_mutex); 1856 goto out; 1857 } 1858 1859 spin_lock(&ci->i_ceph_lock); 1860 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1861 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1862 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1863 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1864 check = 1; 1865 spin_unlock(&ci->i_ceph_lock); 1866 mutex_unlock(&ci->i_truncate_mutex); 1867 goto out; 1868 } 1869 orig_gen = ci->i_rdcache_gen; 1870 spin_unlock(&ci->i_ceph_lock); 1871 1872 ceph_fscache_invalidate(inode); 1873 if 
(invalidate_inode_pages2(inode->i_mapping) < 0) { 1874 pr_err("invalidate_pages %p fails\n", inode); 1875 } 1876 1877 spin_lock(&ci->i_ceph_lock); 1878 if (orig_gen == ci->i_rdcache_gen && 1879 orig_gen == ci->i_rdcache_revoking) { 1880 dout("invalidate_pages %p gen %d successful\n", inode, 1881 ci->i_rdcache_gen); 1882 ci->i_rdcache_revoking--; 1883 check = 1; 1884 } else { 1885 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", 1886 inode, orig_gen, ci->i_rdcache_gen, 1887 ci->i_rdcache_revoking); 1888 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) 1889 check = 1; 1890 } 1891 spin_unlock(&ci->i_ceph_lock); 1892 mutex_unlock(&ci->i_truncate_mutex); 1893 out: 1894 if (check) 1895 ceph_check_caps(ci, 0, NULL); 1896 } 1897 1898 /* 1899 * Make sure any pending truncation is applied before doing anything 1900 * that may depend on it. 1901 */ 1902 void __ceph_do_pending_vmtruncate(struct inode *inode) 1903 { 1904 struct ceph_inode_info *ci = ceph_inode(inode); 1905 u64 to; 1906 int wrbuffer_refs, finish = 0; 1907 1908 mutex_lock(&ci->i_truncate_mutex); 1909 retry: 1910 spin_lock(&ci->i_ceph_lock); 1911 if (ci->i_truncate_pending == 0) { 1912 dout("__do_pending_vmtruncate %p none pending\n", inode); 1913 spin_unlock(&ci->i_ceph_lock); 1914 mutex_unlock(&ci->i_truncate_mutex); 1915 return; 1916 } 1917 1918 /* 1919 * make sure any dirty snapped pages are flushed before we 1920 * possibly truncate them.. so write AND block! 
1921 */ 1922 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1923 spin_unlock(&ci->i_ceph_lock); 1924 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1925 inode); 1926 filemap_write_and_wait_range(&inode->i_data, 0, 1927 inode->i_sb->s_maxbytes); 1928 goto retry; 1929 } 1930 1931 /* there should be no reader or writer */ 1932 WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); 1933 1934 to = ci->i_truncate_size; 1935 wrbuffer_refs = ci->i_wrbuffer_ref; 1936 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, 1937 ci->i_truncate_pending, to); 1938 spin_unlock(&ci->i_ceph_lock); 1939 1940 truncate_pagecache(inode, to); 1941 1942 spin_lock(&ci->i_ceph_lock); 1943 if (to == ci->i_truncate_size) { 1944 ci->i_truncate_pending = 0; 1945 finish = 1; 1946 } 1947 spin_unlock(&ci->i_ceph_lock); 1948 if (!finish) 1949 goto retry; 1950 1951 mutex_unlock(&ci->i_truncate_mutex); 1952 1953 if (wrbuffer_refs == 0) 1954 ceph_check_caps(ci, 0, NULL); 1955 1956 wake_up_all(&ci->i_cap_wq); 1957 } 1958 1959 static void ceph_inode_work(struct work_struct *work) 1960 { 1961 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1962 i_work); 1963 struct inode *inode = &ci->vfs_inode; 1964 1965 if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { 1966 dout("writeback %p\n", inode); 1967 filemap_fdatawrite(&inode->i_data); 1968 } 1969 if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) 1970 ceph_do_invalidate_pages(inode); 1971 1972 if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask)) 1973 __ceph_do_pending_vmtruncate(inode); 1974 1975 if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) 1976 ceph_check_caps(ci, 0, NULL); 1977 1978 if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) 1979 ceph_flush_snaps(ci, NULL); 1980 1981 iput(inode); 1982 } 1983 1984 /* 1985 * symlinks 1986 */ 1987 static const struct inode_operations ceph_symlink_iops = { 1988 .get_link = simple_get_link, 1989 
.setattr = ceph_setattr, 1990 .getattr = ceph_getattr, 1991 .listxattr = ceph_listxattr, 1992 }; 1993 1994 int __ceph_setattr(struct inode *inode, struct iattr *attr) 1995 { 1996 struct ceph_inode_info *ci = ceph_inode(inode); 1997 unsigned int ia_valid = attr->ia_valid; 1998 struct ceph_mds_request *req; 1999 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 2000 struct ceph_cap_flush *prealloc_cf; 2001 int issued; 2002 int release = 0, dirtied = 0; 2003 int mask = 0; 2004 int err = 0; 2005 int inode_dirty_flags = 0; 2006 bool lock_snap_rwsem = false; 2007 2008 prealloc_cf = ceph_alloc_cap_flush(); 2009 if (!prealloc_cf) 2010 return -ENOMEM; 2011 2012 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, 2013 USE_AUTH_MDS); 2014 if (IS_ERR(req)) { 2015 ceph_free_cap_flush(prealloc_cf); 2016 return PTR_ERR(req); 2017 } 2018 2019 spin_lock(&ci->i_ceph_lock); 2020 issued = __ceph_caps_issued(ci, NULL); 2021 2022 if (!ci->i_head_snapc && 2023 (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { 2024 lock_snap_rwsem = true; 2025 if (!down_read_trylock(&mdsc->snap_rwsem)) { 2026 spin_unlock(&ci->i_ceph_lock); 2027 down_read(&mdsc->snap_rwsem); 2028 spin_lock(&ci->i_ceph_lock); 2029 issued = __ceph_caps_issued(ci, NULL); 2030 } 2031 } 2032 2033 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); 2034 2035 if (ia_valid & ATTR_UID) { 2036 dout("setattr %p uid %d -> %d\n", inode, 2037 from_kuid(&init_user_ns, inode->i_uid), 2038 from_kuid(&init_user_ns, attr->ia_uid)); 2039 if (issued & CEPH_CAP_AUTH_EXCL) { 2040 inode->i_uid = attr->ia_uid; 2041 dirtied |= CEPH_CAP_AUTH_EXCL; 2042 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2043 !uid_eq(attr->ia_uid, inode->i_uid)) { 2044 req->r_args.setattr.uid = cpu_to_le32( 2045 from_kuid(&init_user_ns, attr->ia_uid)); 2046 mask |= CEPH_SETATTR_UID; 2047 release |= CEPH_CAP_AUTH_SHARED; 2048 } 2049 } 2050 if (ia_valid & ATTR_GID) { 2051 dout("setattr %p gid %d -> %d\n", inode, 2052 
from_kgid(&init_user_ns, inode->i_gid), 2053 from_kgid(&init_user_ns, attr->ia_gid)); 2054 if (issued & CEPH_CAP_AUTH_EXCL) { 2055 inode->i_gid = attr->ia_gid; 2056 dirtied |= CEPH_CAP_AUTH_EXCL; 2057 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2058 !gid_eq(attr->ia_gid, inode->i_gid)) { 2059 req->r_args.setattr.gid = cpu_to_le32( 2060 from_kgid(&init_user_ns, attr->ia_gid)); 2061 mask |= CEPH_SETATTR_GID; 2062 release |= CEPH_CAP_AUTH_SHARED; 2063 } 2064 } 2065 if (ia_valid & ATTR_MODE) { 2066 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, 2067 attr->ia_mode); 2068 if (issued & CEPH_CAP_AUTH_EXCL) { 2069 inode->i_mode = attr->ia_mode; 2070 dirtied |= CEPH_CAP_AUTH_EXCL; 2071 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 2072 attr->ia_mode != inode->i_mode) { 2073 inode->i_mode = attr->ia_mode; 2074 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 2075 mask |= CEPH_SETATTR_MODE; 2076 release |= CEPH_CAP_AUTH_SHARED; 2077 } 2078 } 2079 2080 if (ia_valid & ATTR_ATIME) { 2081 dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, 2082 inode->i_atime.tv_sec, inode->i_atime.tv_nsec, 2083 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); 2084 if (issued & CEPH_CAP_FILE_EXCL) { 2085 ci->i_time_warp_seq++; 2086 inode->i_atime = attr->ia_atime; 2087 dirtied |= CEPH_CAP_FILE_EXCL; 2088 } else if ((issued & CEPH_CAP_FILE_WR) && 2089 timespec64_compare(&inode->i_atime, 2090 &attr->ia_atime) < 0) { 2091 inode->i_atime = attr->ia_atime; 2092 dirtied |= CEPH_CAP_FILE_WR; 2093 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2094 !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { 2095 ceph_encode_timespec64(&req->r_args.setattr.atime, 2096 &attr->ia_atime); 2097 mask |= CEPH_SETATTR_ATIME; 2098 release |= CEPH_CAP_FILE_SHARED | 2099 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2100 } 2101 } 2102 if (ia_valid & ATTR_SIZE) { 2103 loff_t isize = i_size_read(inode); 2104 2105 dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); 2106 if 
((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { 2107 i_size_write(inode, attr->ia_size); 2108 inode->i_blocks = calc_inode_blocks(attr->ia_size); 2109 ci->i_reported_size = attr->ia_size; 2110 dirtied |= CEPH_CAP_FILE_EXCL; 2111 ia_valid |= ATTR_MTIME; 2112 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2113 attr->ia_size != isize) { 2114 req->r_args.setattr.size = cpu_to_le64(attr->ia_size); 2115 req->r_args.setattr.old_size = cpu_to_le64(isize); 2116 mask |= CEPH_SETATTR_SIZE; 2117 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2118 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2119 } 2120 } 2121 if (ia_valid & ATTR_MTIME) { 2122 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, 2123 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, 2124 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); 2125 if (issued & CEPH_CAP_FILE_EXCL) { 2126 ci->i_time_warp_seq++; 2127 inode->i_mtime = attr->ia_mtime; 2128 dirtied |= CEPH_CAP_FILE_EXCL; 2129 } else if ((issued & CEPH_CAP_FILE_WR) && 2130 timespec64_compare(&inode->i_mtime, 2131 &attr->ia_mtime) < 0) { 2132 inode->i_mtime = attr->ia_mtime; 2133 dirtied |= CEPH_CAP_FILE_WR; 2134 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2135 !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { 2136 ceph_encode_timespec64(&req->r_args.setattr.mtime, 2137 &attr->ia_mtime); 2138 mask |= CEPH_SETATTR_MTIME; 2139 release |= CEPH_CAP_FILE_SHARED | 2140 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2141 } 2142 } 2143 2144 /* these do nothing */ 2145 if (ia_valid & ATTR_CTIME) { 2146 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| 2147 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; 2148 dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, 2149 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 2150 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, 2151 only ? 
"ctime only" : "ignored"); 2152 if (only) { 2153 /* 2154 * if kernel wants to dirty ctime but nothing else, 2155 * we need to choose a cap to dirty under, or do 2156 * a almost-no-op setattr 2157 */ 2158 if (issued & CEPH_CAP_AUTH_EXCL) 2159 dirtied |= CEPH_CAP_AUTH_EXCL; 2160 else if (issued & CEPH_CAP_FILE_EXCL) 2161 dirtied |= CEPH_CAP_FILE_EXCL; 2162 else if (issued & CEPH_CAP_XATTR_EXCL) 2163 dirtied |= CEPH_CAP_XATTR_EXCL; 2164 else 2165 mask |= CEPH_SETATTR_CTIME; 2166 } 2167 } 2168 if (ia_valid & ATTR_FILE) 2169 dout("setattr %p ATTR_FILE ... hrm!\n", inode); 2170 2171 if (dirtied) { 2172 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 2173 &prealloc_cf); 2174 inode->i_ctime = attr->ia_ctime; 2175 } 2176 2177 release &= issued; 2178 spin_unlock(&ci->i_ceph_lock); 2179 if (lock_snap_rwsem) 2180 up_read(&mdsc->snap_rwsem); 2181 2182 if (inode_dirty_flags) 2183 __mark_inode_dirty(inode, inode_dirty_flags); 2184 2185 2186 if (mask) { 2187 req->r_inode = inode; 2188 ihold(inode); 2189 req->r_inode_drop = release; 2190 req->r_args.setattr.mask = cpu_to_le32(mask); 2191 req->r_num_caps = 1; 2192 req->r_stamp = attr->ia_ctime; 2193 err = ceph_mdsc_do_request(mdsc, NULL, req); 2194 } 2195 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, 2196 ceph_cap_string(dirtied), mask); 2197 2198 ceph_mdsc_put_request(req); 2199 ceph_free_cap_flush(prealloc_cf); 2200 2201 if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) 2202 __ceph_do_pending_vmtruncate(inode); 2203 2204 return err; 2205 } 2206 2207 /* 2208 * setattr 2209 */ 2210 int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 2211 struct iattr *attr) 2212 { 2213 struct inode *inode = d_inode(dentry); 2214 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 2215 int err; 2216 2217 if (ceph_snap(inode) != CEPH_NOSNAP) 2218 return -EROFS; 2219 2220 err = setattr_prepare(&init_user_ns, dentry, attr); 2221 if (err != 0) 2222 return err; 2223 2224 if ((attr->ia_valid & ATTR_SIZE) 
&& 2225 attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) 2226 return -EFBIG; 2227 2228 if ((attr->ia_valid & ATTR_SIZE) && 2229 ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) 2230 return -EDQUOT; 2231 2232 err = __ceph_setattr(inode, attr); 2233 2234 if (err >= 0 && (attr->ia_valid & ATTR_MODE)) 2235 err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); 2236 2237 return err; 2238 } 2239 2240 /* 2241 * Verify that we have a lease on the given mask. If not, 2242 * do a getattr against an mds. 2243 */ 2244 int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 2245 int mask, bool force) 2246 { 2247 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 2248 struct ceph_mds_client *mdsc = fsc->mdsc; 2249 struct ceph_mds_request *req; 2250 int mode; 2251 int err; 2252 2253 if (ceph_snap(inode) == CEPH_SNAPDIR) { 2254 dout("do_getattr inode %p SNAPDIR\n", inode); 2255 return 0; 2256 } 2257 2258 dout("do_getattr inode %p mask %s mode 0%o\n", 2259 inode, ceph_cap_string(mask), inode->i_mode); 2260 if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) 2261 return 0; 2262 2263 mode = (mask & CEPH_STAT_RSTAT) ? 
USE_AUTH_MDS : USE_ANY_MDS; 2264 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); 2265 if (IS_ERR(req)) 2266 return PTR_ERR(req); 2267 req->r_inode = inode; 2268 ihold(inode); 2269 req->r_num_caps = 1; 2270 req->r_args.getattr.mask = cpu_to_le32(mask); 2271 req->r_locked_page = locked_page; 2272 err = ceph_mdsc_do_request(mdsc, NULL, req); 2273 if (locked_page && err == 0) { 2274 u64 inline_version = req->r_reply_info.targeti.inline_version; 2275 if (inline_version == 0) { 2276 /* the reply is supposed to contain inline data */ 2277 err = -EINVAL; 2278 } else if (inline_version == CEPH_INLINE_NONE) { 2279 err = -ENODATA; 2280 } else { 2281 err = req->r_reply_info.targeti.inline_len; 2282 } 2283 } 2284 ceph_mdsc_put_request(req); 2285 dout("do_getattr result=%d\n", err); 2286 return err; 2287 } 2288 2289 2290 /* 2291 * Check inode permissions. We verify we have a valid value for 2292 * the AUTH cap, then call the generic handler. 2293 */ 2294 int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, 2295 int mask) 2296 { 2297 int err; 2298 2299 if (mask & MAY_NOT_BLOCK) 2300 return -ECHILD; 2301 2302 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); 2303 2304 if (!err) 2305 err = generic_permission(&init_user_ns, inode, mask); 2306 return err; 2307 } 2308 2309 /* Craft a mask of needed caps given a set of requested statx attrs. */ 2310 static int statx_to_caps(u32 want, umode_t mode) 2311 { 2312 int mask = 0; 2313 2314 if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) 2315 mask |= CEPH_CAP_AUTH_SHARED; 2316 2317 if (want & (STATX_NLINK|STATX_CTIME)) { 2318 /* 2319 * The link count for directories depends on inode->i_subdirs, 2320 * and that is only updated when Fs caps are held. 
2321 */ 2322 if (S_ISDIR(mode)) 2323 mask |= CEPH_CAP_FILE_SHARED; 2324 else 2325 mask |= CEPH_CAP_LINK_SHARED; 2326 } 2327 2328 if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| 2329 STATX_BLOCKS)) 2330 mask |= CEPH_CAP_FILE_SHARED; 2331 2332 if (want & (STATX_CTIME)) 2333 mask |= CEPH_CAP_XATTR_SHARED; 2334 2335 return mask; 2336 } 2337 2338 /* 2339 * Get all the attributes. If we have sufficient caps for the requested attrs, 2340 * then we can avoid talking to the MDS at all. 2341 */ 2342 int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, 2343 struct kstat *stat, u32 request_mask, unsigned int flags) 2344 { 2345 struct inode *inode = d_inode(path->dentry); 2346 struct ceph_inode_info *ci = ceph_inode(inode); 2347 u32 valid_mask = STATX_BASIC_STATS; 2348 int err = 0; 2349 2350 /* Skip the getattr altogether if we're asked not to sync */ 2351 if (!(flags & AT_STATX_DONT_SYNC)) { 2352 err = ceph_do_getattr(inode, 2353 statx_to_caps(request_mask, inode->i_mode), 2354 flags & AT_STATX_FORCE_SYNC); 2355 if (err) 2356 return err; 2357 } 2358 2359 generic_fillattr(&init_user_ns, inode, stat); 2360 stat->ino = ceph_present_inode(inode); 2361 2362 /* 2363 * btime on newly-allocated inodes is 0, so if this is still set to 2364 * that, then assume that it's not valid. 2365 */ 2366 if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { 2367 stat->btime = ci->i_btime; 2368 valid_mask |= STATX_BTIME; 2369 } 2370 2371 if (ceph_snap(inode) == CEPH_NOSNAP) 2372 stat->dev = inode->i_sb->s_dev; 2373 else 2374 stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; 2375 2376 if (S_ISDIR(inode->i_mode)) { 2377 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), 2378 RBYTES)) 2379 stat->size = ci->i_rbytes; 2380 else 2381 stat->size = ci->i_files + ci->i_subdirs; 2382 stat->blocks = 0; 2383 stat->blksize = 65536; 2384 /* 2385 * Some applications rely on the number of st_nlink 2386 * value on directories to be either 0 (if unlinked) 2387 * or 2 + number of subdirectories. 2388 */ 2389 if (stat->nlink == 1) 2390 /* '.' + '..' + subdirs */ 2391 stat->nlink = 1 + 1 + ci->i_subdirs; 2392 } 2393 2394 stat->result_mask = request_mask & valid_mask; 2395 return err; 2396 } 2397