1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/kernel.h> 7 #include <linux/bio.h> 8 #include <linux/file.h> 9 #include <linux/fs.h> 10 #include <linux/fsnotify.h> 11 #include <linux/pagemap.h> 12 #include <linux/highmem.h> 13 #include <linux/time.h> 14 #include <linux/string.h> 15 #include <linux/backing-dev.h> 16 #include <linux/mount.h> 17 #include <linux/namei.h> 18 #include <linux/writeback.h> 19 #include <linux/compat.h> 20 #include <linux/security.h> 21 #include <linux/xattr.h> 22 #include <linux/mm.h> 23 #include <linux/slab.h> 24 #include <linux/blkdev.h> 25 #include <linux/uuid.h> 26 #include <linux/btrfs.h> 27 #include <linux/uaccess.h> 28 #include <linux/iversion.h> 29 #include "ctree.h" 30 #include "disk-io.h" 31 #include "transaction.h" 32 #include "btrfs_inode.h" 33 #include "print-tree.h" 34 #include "volumes.h" 35 #include "locking.h" 36 #include "inode-map.h" 37 #include "backref.h" 38 #include "rcu-string.h" 39 #include "send.h" 40 #include "dev-replace.h" 41 #include "props.h" 42 #include "sysfs.h" 43 #include "qgroup.h" 44 #include "tree-log.h" 45 #include "compression.h" 46 47 #ifdef CONFIG_64BIT 48 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 49 * structures are incorrect, as the timespec structure from userspace 50 * is 4 bytes too small. We define these alternatives here to teach 51 * the kernel about the 32-bit struct packing. 
52 */ 53 struct btrfs_ioctl_timespec_32 { 54 __u64 sec; 55 __u32 nsec; 56 } __attribute__ ((__packed__)); 57 58 struct btrfs_ioctl_received_subvol_args_32 { 59 char uuid[BTRFS_UUID_SIZE]; /* in */ 60 __u64 stransid; /* in */ 61 __u64 rtransid; /* out */ 62 struct btrfs_ioctl_timespec_32 stime; /* in */ 63 struct btrfs_ioctl_timespec_32 rtime; /* out */ 64 __u64 flags; /* in */ 65 __u64 reserved[16]; /* in */ 66 } __attribute__ ((__packed__)); 67 68 #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ 69 struct btrfs_ioctl_received_subvol_args_32) 70 #endif 71 72 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 73 struct btrfs_ioctl_send_args_32 { 74 __s64 send_fd; /* in */ 75 __u64 clone_sources_count; /* in */ 76 compat_uptr_t clone_sources; /* in */ 77 __u64 parent_root; /* in */ 78 __u64 flags; /* in */ 79 __u64 reserved[4]; /* in */ 80 } __attribute__ ((__packed__)); 81 82 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ 83 struct btrfs_ioctl_send_args_32) 84 #endif 85 86 static int btrfs_clone(struct inode *src, struct inode *inode, 87 u64 off, u64 olen, u64 olen_aligned, u64 destoff, 88 int no_time_update); 89 90 /* Mask out flags that are inappropriate for the given type of inode. */ 91 static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, 92 unsigned int flags) 93 { 94 if (S_ISDIR(inode->i_mode)) 95 return flags; 96 else if (S_ISREG(inode->i_mode)) 97 return flags & ~FS_DIRSYNC_FL; 98 else 99 return flags & (FS_NODUMP_FL | FS_NOATIME_FL); 100 } 101 102 /* 103 * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS 104 * ioctl. 
105 */ 106 static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) 107 { 108 unsigned int iflags = 0; 109 110 if (flags & BTRFS_INODE_SYNC) 111 iflags |= FS_SYNC_FL; 112 if (flags & BTRFS_INODE_IMMUTABLE) 113 iflags |= FS_IMMUTABLE_FL; 114 if (flags & BTRFS_INODE_APPEND) 115 iflags |= FS_APPEND_FL; 116 if (flags & BTRFS_INODE_NODUMP) 117 iflags |= FS_NODUMP_FL; 118 if (flags & BTRFS_INODE_NOATIME) 119 iflags |= FS_NOATIME_FL; 120 if (flags & BTRFS_INODE_DIRSYNC) 121 iflags |= FS_DIRSYNC_FL; 122 if (flags & BTRFS_INODE_NODATACOW) 123 iflags |= FS_NOCOW_FL; 124 125 if (flags & BTRFS_INODE_NOCOMPRESS) 126 iflags |= FS_NOCOMP_FL; 127 else if (flags & BTRFS_INODE_COMPRESS) 128 iflags |= FS_COMPR_FL; 129 130 return iflags; 131 } 132 133 /* 134 * Update inode->i_flags based on the btrfs internal flags. 135 */ 136 void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) 137 { 138 struct btrfs_inode *binode = BTRFS_I(inode); 139 unsigned int new_fl = 0; 140 141 if (binode->flags & BTRFS_INODE_SYNC) 142 new_fl |= S_SYNC; 143 if (binode->flags & BTRFS_INODE_IMMUTABLE) 144 new_fl |= S_IMMUTABLE; 145 if (binode->flags & BTRFS_INODE_APPEND) 146 new_fl |= S_APPEND; 147 if (binode->flags & BTRFS_INODE_NOATIME) 148 new_fl |= S_NOATIME; 149 if (binode->flags & BTRFS_INODE_DIRSYNC) 150 new_fl |= S_DIRSYNC; 151 152 set_mask_bits(&inode->i_flags, 153 S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, 154 new_fl); 155 } 156 157 static int btrfs_ioctl_getflags(struct file *file, void __user *arg) 158 { 159 struct btrfs_inode *binode = BTRFS_I(file_inode(file)); 160 unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); 161 162 if (copy_to_user(arg, &flags, sizeof(flags))) 163 return -EFAULT; 164 return 0; 165 } 166 167 /* Check if @flags are a supported and valid set of FS_*_FL flags */ 168 static int check_fsflags(unsigned int flags) 169 { 170 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 171 FS_NOATIME_FL | FS_NODUMP_FL | \ 172 FS_SYNC_FL | 
FS_DIRSYNC_FL | \ 173 FS_NOCOMP_FL | FS_COMPR_FL | 174 FS_NOCOW_FL)) 175 return -EOPNOTSUPP; 176 177 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) 178 return -EINVAL; 179 180 return 0; 181 } 182 183 static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 184 { 185 struct inode *inode = file_inode(file); 186 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 187 struct btrfs_inode *binode = BTRFS_I(inode); 188 struct btrfs_root *root = binode->root; 189 struct btrfs_trans_handle *trans; 190 unsigned int fsflags, old_fsflags; 191 int ret; 192 u64 old_flags; 193 unsigned int old_i_flags; 194 umode_t mode; 195 196 if (!inode_owner_or_capable(inode)) 197 return -EPERM; 198 199 if (btrfs_root_readonly(root)) 200 return -EROFS; 201 202 if (copy_from_user(&fsflags, arg, sizeof(fsflags))) 203 return -EFAULT; 204 205 ret = check_fsflags(fsflags); 206 if (ret) 207 return ret; 208 209 ret = mnt_want_write_file(file); 210 if (ret) 211 return ret; 212 213 inode_lock(inode); 214 215 old_flags = binode->flags; 216 old_i_flags = inode->i_flags; 217 mode = inode->i_mode; 218 219 fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); 220 old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); 221 if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 222 if (!capable(CAP_LINUX_IMMUTABLE)) { 223 ret = -EPERM; 224 goto out_unlock; 225 } 226 } 227 228 if (fsflags & FS_SYNC_FL) 229 binode->flags |= BTRFS_INODE_SYNC; 230 else 231 binode->flags &= ~BTRFS_INODE_SYNC; 232 if (fsflags & FS_IMMUTABLE_FL) 233 binode->flags |= BTRFS_INODE_IMMUTABLE; 234 else 235 binode->flags &= ~BTRFS_INODE_IMMUTABLE; 236 if (fsflags & FS_APPEND_FL) 237 binode->flags |= BTRFS_INODE_APPEND; 238 else 239 binode->flags &= ~BTRFS_INODE_APPEND; 240 if (fsflags & FS_NODUMP_FL) 241 binode->flags |= BTRFS_INODE_NODUMP; 242 else 243 binode->flags &= ~BTRFS_INODE_NODUMP; 244 if (fsflags & FS_NOATIME_FL) 245 binode->flags |= BTRFS_INODE_NOATIME; 246 else 247 binode->flags &= 
~BTRFS_INODE_NOATIME; 248 if (fsflags & FS_DIRSYNC_FL) 249 binode->flags |= BTRFS_INODE_DIRSYNC; 250 else 251 binode->flags &= ~BTRFS_INODE_DIRSYNC; 252 if (fsflags & FS_NOCOW_FL) { 253 if (S_ISREG(mode)) { 254 /* 255 * It's safe to turn csums off here, no extents exist. 256 * Otherwise we want the flag to reflect the real COW 257 * status of the file and will not set it. 258 */ 259 if (inode->i_size == 0) 260 binode->flags |= BTRFS_INODE_NODATACOW 261 | BTRFS_INODE_NODATASUM; 262 } else { 263 binode->flags |= BTRFS_INODE_NODATACOW; 264 } 265 } else { 266 /* 267 * Revert back under same assumptions as above 268 */ 269 if (S_ISREG(mode)) { 270 if (inode->i_size == 0) 271 binode->flags &= ~(BTRFS_INODE_NODATACOW 272 | BTRFS_INODE_NODATASUM); 273 } else { 274 binode->flags &= ~BTRFS_INODE_NODATACOW; 275 } 276 } 277 278 /* 279 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS 280 * flag may be changed automatically if compression code won't make 281 * things smaller. 282 */ 283 if (fsflags & FS_NOCOMP_FL) { 284 binode->flags &= ~BTRFS_INODE_COMPRESS; 285 binode->flags |= BTRFS_INODE_NOCOMPRESS; 286 287 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); 288 if (ret && ret != -ENODATA) 289 goto out_drop; 290 } else if (fsflags & FS_COMPR_FL) { 291 const char *comp; 292 293 binode->flags |= BTRFS_INODE_COMPRESS; 294 binode->flags &= ~BTRFS_INODE_NOCOMPRESS; 295 296 comp = btrfs_compress_type2str(fs_info->compress_type); 297 if (!comp || comp[0] == 0) 298 comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); 299 300 ret = btrfs_set_prop(inode, "btrfs.compression", 301 comp, strlen(comp), 0); 302 if (ret) 303 goto out_drop; 304 305 } else { 306 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); 307 if (ret && ret != -ENODATA) 308 goto out_drop; 309 binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 310 } 311 312 trans = btrfs_start_transaction(root, 1); 313 if (IS_ERR(trans)) { 314 ret = PTR_ERR(trans); 315 goto 
out_drop; 316 } 317 318 btrfs_sync_inode_flags_to_i_flags(inode); 319 inode_inc_iversion(inode); 320 inode->i_ctime = current_time(inode); 321 ret = btrfs_update_inode(trans, root, inode); 322 323 btrfs_end_transaction(trans); 324 out_drop: 325 if (ret) { 326 binode->flags = old_flags; 327 inode->i_flags = old_i_flags; 328 } 329 330 out_unlock: 331 inode_unlock(inode); 332 mnt_drop_write_file(file); 333 return ret; 334 } 335 336 /* 337 * Translate btrfs internal inode flags to xflags as expected by the 338 * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are 339 * silently dropped. 340 */ 341 static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) 342 { 343 unsigned int xflags = 0; 344 345 if (flags & BTRFS_INODE_APPEND) 346 xflags |= FS_XFLAG_APPEND; 347 if (flags & BTRFS_INODE_IMMUTABLE) 348 xflags |= FS_XFLAG_IMMUTABLE; 349 if (flags & BTRFS_INODE_NOATIME) 350 xflags |= FS_XFLAG_NOATIME; 351 if (flags & BTRFS_INODE_NODUMP) 352 xflags |= FS_XFLAG_NODUMP; 353 if (flags & BTRFS_INODE_SYNC) 354 xflags |= FS_XFLAG_SYNC; 355 356 return xflags; 357 } 358 359 /* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ 360 static int check_xflags(unsigned int flags) 361 { 362 if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | 363 FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) 364 return -EOPNOTSUPP; 365 return 0; 366 } 367 368 /* 369 * Set the xflags from the internal inode flags. The remaining items of fsxattr 370 * are zeroed. 
371 */ 372 static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) 373 { 374 struct btrfs_inode *binode = BTRFS_I(file_inode(file)); 375 struct fsxattr fa; 376 377 memset(&fa, 0, sizeof(fa)); 378 fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags); 379 380 if (copy_to_user(arg, &fa, sizeof(fa))) 381 return -EFAULT; 382 383 return 0; 384 } 385 386 static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) 387 { 388 struct inode *inode = file_inode(file); 389 struct btrfs_inode *binode = BTRFS_I(inode); 390 struct btrfs_root *root = binode->root; 391 struct btrfs_trans_handle *trans; 392 struct fsxattr fa; 393 unsigned old_flags; 394 unsigned old_i_flags; 395 int ret = 0; 396 397 if (!inode_owner_or_capable(inode)) 398 return -EPERM; 399 400 if (btrfs_root_readonly(root)) 401 return -EROFS; 402 403 memset(&fa, 0, sizeof(fa)); 404 if (copy_from_user(&fa, arg, sizeof(fa))) 405 return -EFAULT; 406 407 ret = check_xflags(fa.fsx_xflags); 408 if (ret) 409 return ret; 410 411 if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) 412 return -EOPNOTSUPP; 413 414 ret = mnt_want_write_file(file); 415 if (ret) 416 return ret; 417 418 inode_lock(inode); 419 420 old_flags = binode->flags; 421 old_i_flags = inode->i_flags; 422 423 /* We need the capabilities to change append-only or immutable inode */ 424 if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) || 425 (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) && 426 !capable(CAP_LINUX_IMMUTABLE)) { 427 ret = -EPERM; 428 goto out_unlock; 429 } 430 431 if (fa.fsx_xflags & FS_XFLAG_SYNC) 432 binode->flags |= BTRFS_INODE_SYNC; 433 else 434 binode->flags &= ~BTRFS_INODE_SYNC; 435 if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) 436 binode->flags |= BTRFS_INODE_IMMUTABLE; 437 else 438 binode->flags &= ~BTRFS_INODE_IMMUTABLE; 439 if (fa.fsx_xflags & FS_XFLAG_APPEND) 440 binode->flags |= BTRFS_INODE_APPEND; 441 else 442 binode->flags &= ~BTRFS_INODE_APPEND; 443 if 
(fa.fsx_xflags & FS_XFLAG_NODUMP) 444 binode->flags |= BTRFS_INODE_NODUMP; 445 else 446 binode->flags &= ~BTRFS_INODE_NODUMP; 447 if (fa.fsx_xflags & FS_XFLAG_NOATIME) 448 binode->flags |= BTRFS_INODE_NOATIME; 449 else 450 binode->flags &= ~BTRFS_INODE_NOATIME; 451 452 /* 1 item for the inode */ 453 trans = btrfs_start_transaction(root, 1); 454 if (IS_ERR(trans)) { 455 ret = PTR_ERR(trans); 456 goto out_unlock; 457 } 458 459 btrfs_sync_inode_flags_to_i_flags(inode); 460 inode_inc_iversion(inode); 461 inode->i_ctime = current_time(inode); 462 ret = btrfs_update_inode(trans, root, inode); 463 464 btrfs_end_transaction(trans); 465 466 out_unlock: 467 if (ret) { 468 binode->flags = old_flags; 469 inode->i_flags = old_i_flags; 470 } 471 472 inode_unlock(inode); 473 mnt_drop_write_file(file); 474 475 return ret; 476 } 477 478 static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 479 { 480 struct inode *inode = file_inode(file); 481 482 return put_user(inode->i_generation, arg); 483 } 484 485 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 486 { 487 struct inode *inode = file_inode(file); 488 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 489 struct btrfs_device *device; 490 struct request_queue *q; 491 struct fstrim_range range; 492 u64 minlen = ULLONG_MAX; 493 u64 num_devices = 0; 494 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 495 int ret; 496 497 if (!capable(CAP_SYS_ADMIN)) 498 return -EPERM; 499 500 rcu_read_lock(); 501 list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, 502 dev_list) { 503 if (!device->bdev) 504 continue; 505 q = bdev_get_queue(device->bdev); 506 if (blk_queue_discard(q)) { 507 num_devices++; 508 minlen = min_t(u64, q->limits.discard_granularity, 509 minlen); 510 } 511 } 512 rcu_read_unlock(); 513 514 if (!num_devices) 515 return -EOPNOTSUPP; 516 if (copy_from_user(&range, arg, sizeof(range))) 517 return -EFAULT; 518 if (range.start > total_bytes || 519 range.len < 
fs_info->sb->s_blocksize) 520 return -EINVAL; 521 522 range.len = min(range.len, total_bytes - range.start); 523 range.minlen = max(range.minlen, minlen); 524 ret = btrfs_trim_fs(fs_info, &range); 525 if (ret < 0) 526 return ret; 527 528 if (copy_to_user(arg, &range, sizeof(range))) 529 return -EFAULT; 530 531 return 0; 532 } 533 534 int btrfs_is_empty_uuid(u8 *uuid) 535 { 536 int i; 537 538 for (i = 0; i < BTRFS_UUID_SIZE; i++) { 539 if (uuid[i]) 540 return 0; 541 } 542 return 1; 543 } 544 545 static noinline int create_subvol(struct inode *dir, 546 struct dentry *dentry, 547 const char *name, int namelen, 548 u64 *async_transid, 549 struct btrfs_qgroup_inherit *inherit) 550 { 551 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 552 struct btrfs_trans_handle *trans; 553 struct btrfs_key key; 554 struct btrfs_root_item *root_item; 555 struct btrfs_inode_item *inode_item; 556 struct extent_buffer *leaf; 557 struct btrfs_root *root = BTRFS_I(dir)->root; 558 struct btrfs_root *new_root; 559 struct btrfs_block_rsv block_rsv; 560 struct timespec64 cur_time = current_time(dir); 561 struct inode *inode; 562 int ret; 563 int err; 564 u64 objectid; 565 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 566 u64 index = 0; 567 uuid_le new_uuid; 568 569 root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); 570 if (!root_item) 571 return -ENOMEM; 572 573 ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); 574 if (ret) 575 goto fail_free; 576 577 /* 578 * Don't create subvolume whose level is not zero. Or qgroup will be 579 * screwed up since it assumes subvolume qgroup's level to be 0. 580 */ 581 if (btrfs_qgroup_level(objectid)) { 582 ret = -ENOSPC; 583 goto fail_free; 584 } 585 586 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 587 /* 588 * The same as the snapshot creation, please see the comment 589 * of create_snapshot(). 
590 */ 591 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); 592 if (ret) 593 goto fail_free; 594 595 trans = btrfs_start_transaction(root, 0); 596 if (IS_ERR(trans)) { 597 ret = PTR_ERR(trans); 598 btrfs_subvolume_release_metadata(fs_info, &block_rsv); 599 goto fail_free; 600 } 601 trans->block_rsv = &block_rsv; 602 trans->bytes_reserved = block_rsv.size; 603 604 ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); 605 if (ret) 606 goto fail; 607 608 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); 609 if (IS_ERR(leaf)) { 610 ret = PTR_ERR(leaf); 611 goto fail; 612 } 613 614 btrfs_mark_buffer_dirty(leaf); 615 616 inode_item = &root_item->inode; 617 btrfs_set_stack_inode_generation(inode_item, 1); 618 btrfs_set_stack_inode_size(inode_item, 3); 619 btrfs_set_stack_inode_nlink(inode_item, 1); 620 btrfs_set_stack_inode_nbytes(inode_item, 621 fs_info->nodesize); 622 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 623 624 btrfs_set_root_flags(root_item, 0); 625 btrfs_set_root_limit(root_item, 0); 626 btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); 627 628 btrfs_set_root_bytenr(root_item, leaf->start); 629 btrfs_set_root_generation(root_item, trans->transid); 630 btrfs_set_root_level(root_item, 0); 631 btrfs_set_root_refs(root_item, 1); 632 btrfs_set_root_used(root_item, leaf->len); 633 btrfs_set_root_last_snapshot(root_item, 0); 634 635 btrfs_set_root_generation_v2(root_item, 636 btrfs_root_generation(root_item)); 637 uuid_le_gen(&new_uuid); 638 memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); 639 btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); 640 btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); 641 root_item->ctime = root_item->otime; 642 btrfs_set_root_ctransid(root_item, trans->transid); 643 btrfs_set_root_otransid(root_item, trans->transid); 644 645 btrfs_tree_unlock(leaf); 646 free_extent_buffer(leaf); 647 leaf = NULL; 648 649 
btrfs_set_root_dirid(root_item, new_dirid); 650 651 key.objectid = objectid; 652 key.offset = 0; 653 key.type = BTRFS_ROOT_ITEM_KEY; 654 ret = btrfs_insert_root(trans, fs_info->tree_root, &key, 655 root_item); 656 if (ret) 657 goto fail; 658 659 key.offset = (u64)-1; 660 new_root = btrfs_read_fs_root_no_name(fs_info, &key); 661 if (IS_ERR(new_root)) { 662 ret = PTR_ERR(new_root); 663 btrfs_abort_transaction(trans, ret); 664 goto fail; 665 } 666 667 btrfs_record_root_in_trans(trans, new_root); 668 669 ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); 670 if (ret) { 671 /* We potentially lose an unused inode item here */ 672 btrfs_abort_transaction(trans, ret); 673 goto fail; 674 } 675 676 mutex_lock(&new_root->objectid_mutex); 677 new_root->highest_objectid = new_dirid; 678 mutex_unlock(&new_root->objectid_mutex); 679 680 /* 681 * insert the directory item 682 */ 683 ret = btrfs_set_inode_index(BTRFS_I(dir), &index); 684 if (ret) { 685 btrfs_abort_transaction(trans, ret); 686 goto fail; 687 } 688 689 ret = btrfs_insert_dir_item(trans, root, 690 name, namelen, BTRFS_I(dir), &key, 691 BTRFS_FT_DIR, index); 692 if (ret) { 693 btrfs_abort_transaction(trans, ret); 694 goto fail; 695 } 696 697 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); 698 ret = btrfs_update_inode(trans, root, dir); 699 BUG_ON(ret); 700 701 ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, 702 btrfs_ino(BTRFS_I(dir)), index, name, namelen); 703 BUG_ON(ret); 704 705 ret = btrfs_uuid_tree_add(trans, root_item->uuid, 706 BTRFS_UUID_KEY_SUBVOL, objectid); 707 if (ret) 708 btrfs_abort_transaction(trans, ret); 709 710 fail: 711 kfree(root_item); 712 trans->block_rsv = NULL; 713 trans->bytes_reserved = 0; 714 btrfs_subvolume_release_metadata(fs_info, &block_rsv); 715 716 if (async_transid) { 717 *async_transid = trans->transid; 718 err = btrfs_commit_transaction_async(trans, 1); 719 if (err) 720 err = btrfs_commit_transaction(trans); 721 } else { 722 err = 
btrfs_commit_transaction(trans); 723 } 724 if (err && !ret) 725 ret = err; 726 727 if (!ret) { 728 inode = btrfs_lookup_dentry(dir, dentry); 729 if (IS_ERR(inode)) 730 return PTR_ERR(inode); 731 d_instantiate(dentry, inode); 732 } 733 return ret; 734 735 fail_free: 736 kfree(root_item); 737 return ret; 738 } 739 740 static int create_snapshot(struct btrfs_root *root, struct inode *dir, 741 struct dentry *dentry, 742 u64 *async_transid, bool readonly, 743 struct btrfs_qgroup_inherit *inherit) 744 { 745 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 746 struct inode *inode; 747 struct btrfs_pending_snapshot *pending_snapshot; 748 struct btrfs_trans_handle *trans; 749 int ret; 750 bool snapshot_force_cow = false; 751 752 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 753 return -EINVAL; 754 755 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); 756 if (!pending_snapshot) 757 return -ENOMEM; 758 759 pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), 760 GFP_KERNEL); 761 pending_snapshot->path = btrfs_alloc_path(); 762 if (!pending_snapshot->root_item || !pending_snapshot->path) { 763 ret = -ENOMEM; 764 goto free_pending; 765 } 766 767 /* 768 * Force new buffered writes to reserve space even when NOCOW is 769 * possible. This is to avoid later writeback (running dealloc) to 770 * fallback to COW mode and unexpectedly fail with ENOSPC. 771 */ 772 atomic_inc(&root->will_be_snapshotted); 773 smp_mb__after_atomic(); 774 /* wait for no snapshot writes */ 775 wait_event(root->subv_writers->wait, 776 percpu_counter_sum(&root->subv_writers->counter) == 0); 777 778 ret = btrfs_start_delalloc_inodes(root); 779 if (ret) 780 goto dec_and_free; 781 782 /* 783 * All previous writes have started writeback in NOCOW mode, so now 784 * we force future writes to fallback to COW mode during snapshot 785 * creation. 
786 */ 787 atomic_inc(&root->snapshot_force_cow); 788 snapshot_force_cow = true; 789 790 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 791 792 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 793 BTRFS_BLOCK_RSV_TEMP); 794 /* 795 * 1 - parent dir inode 796 * 2 - dir entries 797 * 1 - root item 798 * 2 - root ref/backref 799 * 1 - root of snapshot 800 * 1 - UUID item 801 */ 802 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, 803 &pending_snapshot->block_rsv, 8, 804 false); 805 if (ret) 806 goto dec_and_free; 807 808 pending_snapshot->dentry = dentry; 809 pending_snapshot->root = root; 810 pending_snapshot->readonly = readonly; 811 pending_snapshot->dir = dir; 812 pending_snapshot->inherit = inherit; 813 814 trans = btrfs_start_transaction(root, 0); 815 if (IS_ERR(trans)) { 816 ret = PTR_ERR(trans); 817 goto fail; 818 } 819 820 spin_lock(&fs_info->trans_lock); 821 list_add(&pending_snapshot->list, 822 &trans->transaction->pending_snapshots); 823 spin_unlock(&fs_info->trans_lock); 824 if (async_transid) { 825 *async_transid = trans->transid; 826 ret = btrfs_commit_transaction_async(trans, 1); 827 if (ret) 828 ret = btrfs_commit_transaction(trans); 829 } else { 830 ret = btrfs_commit_transaction(trans); 831 } 832 if (ret) 833 goto fail; 834 835 ret = pending_snapshot->error; 836 if (ret) 837 goto fail; 838 839 ret = btrfs_orphan_cleanup(pending_snapshot->snap); 840 if (ret) 841 goto fail; 842 843 inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); 844 if (IS_ERR(inode)) { 845 ret = PTR_ERR(inode); 846 goto fail; 847 } 848 849 d_instantiate(dentry, inode); 850 ret = 0; 851 fail: 852 btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); 853 dec_and_free: 854 if (snapshot_force_cow) 855 atomic_dec(&root->snapshot_force_cow); 856 if (atomic_dec_and_test(&root->will_be_snapshotted)) 857 wake_up_var(&root->will_be_snapshotted); 858 free_pending: 859 kfree(pending_snapshot->root_item); 860 
btrfs_free_path(pending_snapshot->path); 861 kfree(pending_snapshot); 862 863 return ret; 864 } 865 866 /* copy of may_delete in fs/namei.c() 867 * Check whether we can remove a link victim from directory dir, check 868 * whether the type of victim is right. 869 * 1. We can't do it if dir is read-only (done in permission()) 870 * 2. We should have write and exec permissions on dir 871 * 3. We can't remove anything from append-only dir 872 * 4. We can't do anything with immutable dir (done in permission()) 873 * 5. If the sticky bit on dir is set we should either 874 * a. be owner of dir, or 875 * b. be owner of victim, or 876 * c. have CAP_FOWNER capability 877 * 6. If the victim is append-only or immutable we can't do anything with 878 * links pointing to it. 879 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. 880 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. 881 * 9. We can't remove a root or mountpoint. 882 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by 883 * nfs_async_unlink(). 
884 */ 885 886 static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) 887 { 888 int error; 889 890 if (d_really_is_negative(victim)) 891 return -ENOENT; 892 893 BUG_ON(d_inode(victim->d_parent) != dir); 894 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); 895 896 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 897 if (error) 898 return error; 899 if (IS_APPEND(dir)) 900 return -EPERM; 901 if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || 902 IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) 903 return -EPERM; 904 if (isdir) { 905 if (!d_is_dir(victim)) 906 return -ENOTDIR; 907 if (IS_ROOT(victim)) 908 return -EBUSY; 909 } else if (d_is_dir(victim)) 910 return -EISDIR; 911 if (IS_DEADDIR(dir)) 912 return -ENOENT; 913 if (victim->d_flags & DCACHE_NFSFS_RENAMED) 914 return -EBUSY; 915 return 0; 916 } 917 918 /* copy of may_create in fs/namei.c() */ 919 static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 920 { 921 if (d_really_is_positive(child)) 922 return -EEXIST; 923 if (IS_DEADDIR(dir)) 924 return -ENOENT; 925 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 926 } 927 928 /* 929 * Create a new subvolume below @parent. This is largely modeled after 930 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup 931 * inside this filesystem so it's quite a bit simpler. 
932 */ 933 static noinline int btrfs_mksubvol(const struct path *parent, 934 const char *name, int namelen, 935 struct btrfs_root *snap_src, 936 u64 *async_transid, bool readonly, 937 struct btrfs_qgroup_inherit *inherit) 938 { 939 struct inode *dir = d_inode(parent->dentry); 940 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 941 struct dentry *dentry; 942 int error; 943 944 error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); 945 if (error == -EINTR) 946 return error; 947 948 dentry = lookup_one_len(name, parent->dentry, namelen); 949 error = PTR_ERR(dentry); 950 if (IS_ERR(dentry)) 951 goto out_unlock; 952 953 error = btrfs_may_create(dir, dentry); 954 if (error) 955 goto out_dput; 956 957 /* 958 * even if this name doesn't exist, we may get hash collisions. 959 * check for them now when we can safely fail 960 */ 961 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, 962 dir->i_ino, name, 963 namelen); 964 if (error) 965 goto out_dput; 966 967 down_read(&fs_info->subvol_sem); 968 969 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 970 goto out_up_read; 971 972 if (snap_src) { 973 error = create_snapshot(snap_src, dir, dentry, 974 async_transid, readonly, inherit); 975 } else { 976 error = create_subvol(dir, dentry, name, namelen, 977 async_transid, inherit); 978 } 979 if (!error) 980 fsnotify_mkdir(dir, dentry); 981 out_up_read: 982 up_read(&fs_info->subvol_sem); 983 out_dput: 984 dput(dentry); 985 out_unlock: 986 inode_unlock(dir); 987 return error; 988 } 989 990 /* 991 * When we're defragging a range, we don't want to kick it off again 992 * if it is really just waiting for delalloc to send it down. 
993 * If we find a nice big extent or delalloc range for the bytes in the 994 * file you want to defrag, we return 0 to let you know to skip this 995 * part of the file 996 */ 997 static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) 998 { 999 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1000 struct extent_map *em = NULL; 1001 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1002 u64 end; 1003 1004 read_lock(&em_tree->lock); 1005 em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); 1006 read_unlock(&em_tree->lock); 1007 1008 if (em) { 1009 end = extent_map_end(em); 1010 free_extent_map(em); 1011 if (end - offset > thresh) 1012 return 0; 1013 } 1014 /* if we already have a nice delalloc here, just stop */ 1015 thresh /= 2; 1016 end = count_range_bits(io_tree, &offset, offset + thresh, 1017 thresh, EXTENT_DELALLOC, 1); 1018 if (end >= thresh) 1019 return 0; 1020 return 1; 1021 } 1022 1023 /* 1024 * helper function to walk through a file and find extents 1025 * newer than a specific transid, and smaller than thresh. 
1026 * 1027 * This is used by the defragging code to find new and small 1028 * extents 1029 */ 1030 static int find_new_extents(struct btrfs_root *root, 1031 struct inode *inode, u64 newer_than, 1032 u64 *off, u32 thresh) 1033 { 1034 struct btrfs_path *path; 1035 struct btrfs_key min_key; 1036 struct extent_buffer *leaf; 1037 struct btrfs_file_extent_item *extent; 1038 int type; 1039 int ret; 1040 u64 ino = btrfs_ino(BTRFS_I(inode)); 1041 1042 path = btrfs_alloc_path(); 1043 if (!path) 1044 return -ENOMEM; 1045 1046 min_key.objectid = ino; 1047 min_key.type = BTRFS_EXTENT_DATA_KEY; 1048 min_key.offset = *off; 1049 1050 while (1) { 1051 ret = btrfs_search_forward(root, &min_key, path, newer_than); 1052 if (ret != 0) 1053 goto none; 1054 process_slot: 1055 if (min_key.objectid != ino) 1056 goto none; 1057 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 1058 goto none; 1059 1060 leaf = path->nodes[0]; 1061 extent = btrfs_item_ptr(leaf, path->slots[0], 1062 struct btrfs_file_extent_item); 1063 1064 type = btrfs_file_extent_type(leaf, extent); 1065 if (type == BTRFS_FILE_EXTENT_REG && 1066 btrfs_file_extent_num_bytes(leaf, extent) < thresh && 1067 check_defrag_in_cache(inode, min_key.offset, thresh)) { 1068 *off = min_key.offset; 1069 btrfs_free_path(path); 1070 return 0; 1071 } 1072 1073 path->slots[0]++; 1074 if (path->slots[0] < btrfs_header_nritems(leaf)) { 1075 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); 1076 goto process_slot; 1077 } 1078 1079 if (min_key.offset == (u64)-1) 1080 goto none; 1081 1082 min_key.offset++; 1083 btrfs_release_path(path); 1084 } 1085 none: 1086 btrfs_free_path(path); 1087 return -ENOENT; 1088 } 1089 1090 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) 1091 { 1092 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 1093 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1094 struct extent_map *em; 1095 u64 len = PAGE_SIZE; 1096 1097 /* 1098 * hopefully we have this extent in the tree 
already, try without 1099 * the full extent lock 1100 */ 1101 read_lock(&em_tree->lock); 1102 em = lookup_extent_mapping(em_tree, start, len); 1103 read_unlock(&em_tree->lock); 1104 1105 if (!em) { 1106 struct extent_state *cached = NULL; 1107 u64 end = start + len - 1; 1108 1109 /* get the big lock and read metadata off disk */ 1110 lock_extent_bits(io_tree, start, end, &cached); 1111 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); 1112 unlock_extent_cached(io_tree, start, end, &cached); 1113 1114 if (IS_ERR(em)) 1115 return NULL; 1116 } 1117 1118 return em; 1119 } 1120 1121 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) 1122 { 1123 struct extent_map *next; 1124 bool ret = true; 1125 1126 /* this is the last extent */ 1127 if (em->start + em->len >= i_size_read(inode)) 1128 return false; 1129 1130 next = defrag_lookup_extent(inode, em->start + em->len); 1131 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1132 ret = false; 1133 else if ((em->block_start + em->block_len == next->block_start) && 1134 (em->block_len > SZ_128K && next->block_len > SZ_128K)) 1135 ret = false; 1136 1137 free_extent_map(next); 1138 return ret; 1139 } 1140 1141 static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, 1142 u64 *last_len, u64 *skip, u64 *defrag_end, 1143 int compress) 1144 { 1145 struct extent_map *em; 1146 int ret = 1; 1147 bool next_mergeable = true; 1148 bool prev_mergeable = true; 1149 1150 /* 1151 * make sure that once we start defragging an extent, we keep on 1152 * defragging it 1153 */ 1154 if (start < *defrag_end) 1155 return 1; 1156 1157 *skip = 0; 1158 1159 em = defrag_lookup_extent(inode, start); 1160 if (!em) 1161 return 0; 1162 1163 /* this will cover holes, and inline extents */ 1164 if (em->block_start >= EXTENT_MAP_LAST_BYTE) { 1165 ret = 0; 1166 goto out; 1167 } 1168 1169 if (!*defrag_end) 1170 prev_mergeable = false; 1171 1172 next_mergeable = defrag_check_next_extent(inode, em); 
1173 /* 1174 * we hit a real extent, if it is big or the next extent is not a 1175 * real extent, don't bother defragging it 1176 */ 1177 if (!compress && (*last_len == 0 || *last_len >= thresh) && 1178 (em->len >= thresh || (!next_mergeable && !prev_mergeable))) 1179 ret = 0; 1180 out: 1181 /* 1182 * last_len ends up being a counter of how many bytes we've defragged. 1183 * every time we choose not to defrag an extent, we reset *last_len 1184 * so that the next tiny extent will force a defrag. 1185 * 1186 * The end result of this is that tiny extents before a single big 1187 * extent will force at least part of that big extent to be defragged. 1188 */ 1189 if (ret) { 1190 *defrag_end = extent_map_end(em); 1191 } else { 1192 *last_len = 0; 1193 *skip = extent_map_end(em); 1194 *defrag_end = 0; 1195 } 1196 1197 free_extent_map(em); 1198 return ret; 1199 } 1200 1201 /* 1202 * it doesn't do much good to defrag one or two pages 1203 * at a time. This pulls in a nice chunk of pages 1204 * to COW and defrag. 1205 * 1206 * It also makes sure the delalloc code has enough 1207 * dirty data to avoid making new small extents as part 1208 * of the defrag 1209 * 1210 * It's a good idea to start RA on this range 1211 * before calling this. 
1212 */ 1213 static int cluster_pages_for_defrag(struct inode *inode, 1214 struct page **pages, 1215 unsigned long start_index, 1216 unsigned long num_pages) 1217 { 1218 unsigned long file_end; 1219 u64 isize = i_size_read(inode); 1220 u64 page_start; 1221 u64 page_end; 1222 u64 page_cnt; 1223 int ret; 1224 int i; 1225 int i_done; 1226 struct btrfs_ordered_extent *ordered; 1227 struct extent_state *cached_state = NULL; 1228 struct extent_io_tree *tree; 1229 struct extent_changeset *data_reserved = NULL; 1230 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1231 1232 file_end = (isize - 1) >> PAGE_SHIFT; 1233 if (!isize || start_index > file_end) 1234 return 0; 1235 1236 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); 1237 1238 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 1239 start_index << PAGE_SHIFT, 1240 page_cnt << PAGE_SHIFT); 1241 if (ret) 1242 return ret; 1243 i_done = 0; 1244 tree = &BTRFS_I(inode)->io_tree; 1245 1246 /* step one, lock all the pages */ 1247 for (i = 0; i < page_cnt; i++) { 1248 struct page *page; 1249 again: 1250 page = find_or_create_page(inode->i_mapping, 1251 start_index + i, mask); 1252 if (!page) 1253 break; 1254 1255 page_start = page_offset(page); 1256 page_end = page_start + PAGE_SIZE - 1; 1257 while (1) { 1258 lock_extent_bits(tree, page_start, page_end, 1259 &cached_state); 1260 ordered = btrfs_lookup_ordered_extent(inode, 1261 page_start); 1262 unlock_extent_cached(tree, page_start, page_end, 1263 &cached_state); 1264 if (!ordered) 1265 break; 1266 1267 unlock_page(page); 1268 btrfs_start_ordered_extent(inode, ordered, 1); 1269 btrfs_put_ordered_extent(ordered); 1270 lock_page(page); 1271 /* 1272 * we unlocked the page above, so we need check if 1273 * it was released or not. 
1274 */ 1275 if (page->mapping != inode->i_mapping) { 1276 unlock_page(page); 1277 put_page(page); 1278 goto again; 1279 } 1280 } 1281 1282 if (!PageUptodate(page)) { 1283 btrfs_readpage(NULL, page); 1284 lock_page(page); 1285 if (!PageUptodate(page)) { 1286 unlock_page(page); 1287 put_page(page); 1288 ret = -EIO; 1289 break; 1290 } 1291 } 1292 1293 if (page->mapping != inode->i_mapping) { 1294 unlock_page(page); 1295 put_page(page); 1296 goto again; 1297 } 1298 1299 pages[i] = page; 1300 i_done++; 1301 } 1302 if (!i_done || ret) 1303 goto out; 1304 1305 if (!(inode->i_sb->s_flags & SB_ACTIVE)) 1306 goto out; 1307 1308 /* 1309 * so now we have a nice long stream of locked 1310 * and up to date pages, lets wait on them 1311 */ 1312 for (i = 0; i < i_done; i++) 1313 wait_on_page_writeback(pages[i]); 1314 1315 page_start = page_offset(pages[0]); 1316 page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; 1317 1318 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1319 page_start, page_end - 1, &cached_state); 1320 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, 1321 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 1322 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, 1323 &cached_state); 1324 1325 if (i_done != page_cnt) { 1326 spin_lock(&BTRFS_I(inode)->lock); 1327 BTRFS_I(inode)->outstanding_extents++; 1328 spin_unlock(&BTRFS_I(inode)->lock); 1329 btrfs_delalloc_release_space(inode, data_reserved, 1330 start_index << PAGE_SHIFT, 1331 (page_cnt - i_done) << PAGE_SHIFT, true); 1332 } 1333 1334 1335 set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 1336 &cached_state); 1337 1338 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 1339 page_start, page_end - 1, &cached_state); 1340 1341 for (i = 0; i < i_done; i++) { 1342 clear_page_dirty_for_io(pages[i]); 1343 ClearPageChecked(pages[i]); 1344 set_page_extent_mapped(pages[i]); 1345 set_page_dirty(pages[i]); 1346 unlock_page(pages[i]); 1347 put_page(pages[i]); 1348 } 1349 
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, 1350 false); 1351 extent_changeset_free(data_reserved); 1352 return i_done; 1353 out: 1354 for (i = 0; i < i_done; i++) { 1355 unlock_page(pages[i]); 1356 put_page(pages[i]); 1357 } 1358 btrfs_delalloc_release_space(inode, data_reserved, 1359 start_index << PAGE_SHIFT, 1360 page_cnt << PAGE_SHIFT, true); 1361 btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, 1362 true); 1363 extent_changeset_free(data_reserved); 1364 return ret; 1365 1366 } 1367 1368 int btrfs_defrag_file(struct inode *inode, struct file *file, 1369 struct btrfs_ioctl_defrag_range_args *range, 1370 u64 newer_than, unsigned long max_to_defrag) 1371 { 1372 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1373 struct btrfs_root *root = BTRFS_I(inode)->root; 1374 struct file_ra_state *ra = NULL; 1375 unsigned long last_index; 1376 u64 isize = i_size_read(inode); 1377 u64 last_len = 0; 1378 u64 skip = 0; 1379 u64 defrag_end = 0; 1380 u64 newer_off = range->start; 1381 unsigned long i; 1382 unsigned long ra_index = 0; 1383 int ret; 1384 int defrag_count = 0; 1385 int compress_type = BTRFS_COMPRESS_ZLIB; 1386 u32 extent_thresh = range->extent_thresh; 1387 unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; 1388 unsigned long cluster = max_cluster; 1389 u64 new_align = ~((u64)SZ_128K - 1); 1390 struct page **pages = NULL; 1391 bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; 1392 1393 if (isize == 0) 1394 return 0; 1395 1396 if (range->start >= isize) 1397 return -EINVAL; 1398 1399 if (do_compress) { 1400 if (range->compress_type > BTRFS_COMPRESS_TYPES) 1401 return -EINVAL; 1402 if (range->compress_type) 1403 compress_type = range->compress_type; 1404 } 1405 1406 if (extent_thresh == 0) 1407 extent_thresh = SZ_256K; 1408 1409 /* 1410 * If we were not given a file, allocate a readahead context. As 1411 * readahead is just an optimization, defrag will work without it so 1412 * we don't error out. 
1413 */ 1414 if (!file) { 1415 ra = kzalloc(sizeof(*ra), GFP_KERNEL); 1416 if (ra) 1417 file_ra_state_init(ra, inode->i_mapping); 1418 } else { 1419 ra = &file->f_ra; 1420 } 1421 1422 pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); 1423 if (!pages) { 1424 ret = -ENOMEM; 1425 goto out_ra; 1426 } 1427 1428 /* find the last page to defrag */ 1429 if (range->start + range->len > range->start) { 1430 last_index = min_t(u64, isize - 1, 1431 range->start + range->len - 1) >> PAGE_SHIFT; 1432 } else { 1433 last_index = (isize - 1) >> PAGE_SHIFT; 1434 } 1435 1436 if (newer_than) { 1437 ret = find_new_extents(root, inode, newer_than, 1438 &newer_off, SZ_64K); 1439 if (!ret) { 1440 range->start = newer_off; 1441 /* 1442 * we always align our defrag to help keep 1443 * the extents in the file evenly spaced 1444 */ 1445 i = (newer_off & new_align) >> PAGE_SHIFT; 1446 } else 1447 goto out_ra; 1448 } else { 1449 i = range->start >> PAGE_SHIFT; 1450 } 1451 if (!max_to_defrag) 1452 max_to_defrag = last_index - i + 1; 1453 1454 /* 1455 * make writeback starts from i, so the defrag range can be 1456 * written sequentially. 
1457 */ 1458 if (i < inode->i_mapping->writeback_index) 1459 inode->i_mapping->writeback_index = i; 1460 1461 while (i <= last_index && defrag_count < max_to_defrag && 1462 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { 1463 /* 1464 * make sure we stop running if someone unmounts 1465 * the FS 1466 */ 1467 if (!(inode->i_sb->s_flags & SB_ACTIVE)) 1468 break; 1469 1470 if (btrfs_defrag_cancelled(fs_info)) { 1471 btrfs_debug(fs_info, "defrag_file cancelled"); 1472 ret = -EAGAIN; 1473 break; 1474 } 1475 1476 if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, 1477 extent_thresh, &last_len, &skip, 1478 &defrag_end, do_compress)){ 1479 unsigned long next; 1480 /* 1481 * the should_defrag function tells us how much to skip 1482 * bump our counter by the suggested amount 1483 */ 1484 next = DIV_ROUND_UP(skip, PAGE_SIZE); 1485 i = max(i + 1, next); 1486 continue; 1487 } 1488 1489 if (!newer_than) { 1490 cluster = (PAGE_ALIGN(defrag_end) >> 1491 PAGE_SHIFT) - i; 1492 cluster = min(cluster, max_cluster); 1493 } else { 1494 cluster = max_cluster; 1495 } 1496 1497 if (i + cluster > ra_index) { 1498 ra_index = max(i, ra_index); 1499 if (ra) 1500 page_cache_sync_readahead(inode->i_mapping, ra, 1501 file, ra_index, cluster); 1502 ra_index += cluster; 1503 } 1504 1505 inode_lock(inode); 1506 if (do_compress) 1507 BTRFS_I(inode)->defrag_compress = compress_type; 1508 ret = cluster_pages_for_defrag(inode, pages, i, cluster); 1509 if (ret < 0) { 1510 inode_unlock(inode); 1511 goto out_ra; 1512 } 1513 1514 defrag_count += ret; 1515 balance_dirty_pages_ratelimited(inode->i_mapping); 1516 inode_unlock(inode); 1517 1518 if (newer_than) { 1519 if (newer_off == (u64)-1) 1520 break; 1521 1522 if (ret > 0) 1523 i += ret; 1524 1525 newer_off = max(newer_off + 1, 1526 (u64)i << PAGE_SHIFT); 1527 1528 ret = find_new_extents(root, inode, newer_than, 1529 &newer_off, SZ_64K); 1530 if (!ret) { 1531 range->start = newer_off; 1532 i = (newer_off & new_align) >> PAGE_SHIFT; 1533 } else { 1534 
break; 1535 } 1536 } else { 1537 if (ret > 0) { 1538 i += ret; 1539 last_len += ret << PAGE_SHIFT; 1540 } else { 1541 i++; 1542 last_len = 0; 1543 } 1544 } 1545 } 1546 1547 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { 1548 filemap_flush(inode->i_mapping); 1549 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1550 &BTRFS_I(inode)->runtime_flags)) 1551 filemap_flush(inode->i_mapping); 1552 } 1553 1554 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1555 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); 1556 } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { 1557 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); 1558 } 1559 1560 ret = defrag_count; 1561 1562 out_ra: 1563 if (do_compress) { 1564 inode_lock(inode); 1565 BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; 1566 inode_unlock(inode); 1567 } 1568 if (!file) 1569 kfree(ra); 1570 kfree(pages); 1571 return ret; 1572 } 1573 1574 static noinline int btrfs_ioctl_resize(struct file *file, 1575 void __user *arg) 1576 { 1577 struct inode *inode = file_inode(file); 1578 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1579 u64 new_size; 1580 u64 old_size; 1581 u64 devid = 1; 1582 struct btrfs_root *root = BTRFS_I(inode)->root; 1583 struct btrfs_ioctl_vol_args *vol_args; 1584 struct btrfs_trans_handle *trans; 1585 struct btrfs_device *device = NULL; 1586 char *sizestr; 1587 char *retptr; 1588 char *devstr = NULL; 1589 int ret = 0; 1590 int mod = 0; 1591 1592 if (!capable(CAP_SYS_ADMIN)) 1593 return -EPERM; 1594 1595 ret = mnt_want_write_file(file); 1596 if (ret) 1597 return ret; 1598 1599 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 1600 mnt_drop_write_file(file); 1601 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 1602 } 1603 1604 vol_args = memdup_user(arg, sizeof(*vol_args)); 1605 if (IS_ERR(vol_args)) { 1606 ret = PTR_ERR(vol_args); 1607 goto out; 1608 } 1609 1610 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1611 1612 sizestr = vol_args->name; 1613 devstr = strchr(sizestr, ':'); 1614 if 
(devstr) { 1615 sizestr = devstr + 1; 1616 *devstr = '\0'; 1617 devstr = vol_args->name; 1618 ret = kstrtoull(devstr, 10, &devid); 1619 if (ret) 1620 goto out_free; 1621 if (!devid) { 1622 ret = -EINVAL; 1623 goto out_free; 1624 } 1625 btrfs_info(fs_info, "resizing devid %llu", devid); 1626 } 1627 1628 device = btrfs_find_device(fs_info, devid, NULL, NULL); 1629 if (!device) { 1630 btrfs_info(fs_info, "resizer unable to find device %llu", 1631 devid); 1632 ret = -ENODEV; 1633 goto out_free; 1634 } 1635 1636 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1637 btrfs_info(fs_info, 1638 "resizer unable to apply on readonly device %llu", 1639 devid); 1640 ret = -EPERM; 1641 goto out_free; 1642 } 1643 1644 if (!strcmp(sizestr, "max")) 1645 new_size = device->bdev->bd_inode->i_size; 1646 else { 1647 if (sizestr[0] == '-') { 1648 mod = -1; 1649 sizestr++; 1650 } else if (sizestr[0] == '+') { 1651 mod = 1; 1652 sizestr++; 1653 } 1654 new_size = memparse(sizestr, &retptr); 1655 if (*retptr != '\0' || new_size == 0) { 1656 ret = -EINVAL; 1657 goto out_free; 1658 } 1659 } 1660 1661 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1662 ret = -EPERM; 1663 goto out_free; 1664 } 1665 1666 old_size = btrfs_device_get_total_bytes(device); 1667 1668 if (mod < 0) { 1669 if (new_size > old_size) { 1670 ret = -EINVAL; 1671 goto out_free; 1672 } 1673 new_size = old_size - new_size; 1674 } else if (mod > 0) { 1675 if (new_size > ULLONG_MAX - old_size) { 1676 ret = -ERANGE; 1677 goto out_free; 1678 } 1679 new_size = old_size + new_size; 1680 } 1681 1682 if (new_size < SZ_256M) { 1683 ret = -EINVAL; 1684 goto out_free; 1685 } 1686 if (new_size > device->bdev->bd_inode->i_size) { 1687 ret = -EFBIG; 1688 goto out_free; 1689 } 1690 1691 new_size = round_down(new_size, fs_info->sectorsize); 1692 1693 btrfs_info_in_rcu(fs_info, "new size for %s is %llu", 1694 rcu_str_deref(device->name), new_size); 1695 1696 if (new_size > old_size) { 1697 trans = 
btrfs_start_transaction(root, 0); 1698 if (IS_ERR(trans)) { 1699 ret = PTR_ERR(trans); 1700 goto out_free; 1701 } 1702 ret = btrfs_grow_device(trans, device, new_size); 1703 btrfs_commit_transaction(trans); 1704 } else if (new_size < old_size) { 1705 ret = btrfs_shrink_device(device, new_size); 1706 } /* equal, nothing need to do */ 1707 1708 out_free: 1709 kfree(vol_args); 1710 out: 1711 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 1712 mnt_drop_write_file(file); 1713 return ret; 1714 } 1715 1716 static noinline int btrfs_ioctl_snap_create_transid(struct file *file, 1717 const char *name, unsigned long fd, int subvol, 1718 u64 *transid, bool readonly, 1719 struct btrfs_qgroup_inherit *inherit) 1720 { 1721 int namelen; 1722 int ret = 0; 1723 1724 if (!S_ISDIR(file_inode(file)->i_mode)) 1725 return -ENOTDIR; 1726 1727 ret = mnt_want_write_file(file); 1728 if (ret) 1729 goto out; 1730 1731 namelen = strlen(name); 1732 if (strchr(name, '/')) { 1733 ret = -EINVAL; 1734 goto out_drop_write; 1735 } 1736 1737 if (name[0] == '.' && 1738 (namelen == 1 || (name[1] == '.' 
&& namelen == 2))) { 1739 ret = -EEXIST; 1740 goto out_drop_write; 1741 } 1742 1743 if (subvol) { 1744 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1745 NULL, transid, readonly, inherit); 1746 } else { 1747 struct fd src = fdget(fd); 1748 struct inode *src_inode; 1749 if (!src.file) { 1750 ret = -EINVAL; 1751 goto out_drop_write; 1752 } 1753 1754 src_inode = file_inode(src.file); 1755 if (src_inode->i_sb != file_inode(file)->i_sb) { 1756 btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, 1757 "Snapshot src from another FS"); 1758 ret = -EXDEV; 1759 } else if (!inode_owner_or_capable(src_inode)) { 1760 /* 1761 * Subvolume creation is not restricted, but snapshots 1762 * are limited to own subvolumes only 1763 */ 1764 ret = -EPERM; 1765 } else { 1766 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1767 BTRFS_I(src_inode)->root, 1768 transid, readonly, inherit); 1769 } 1770 fdput(src); 1771 } 1772 out_drop_write: 1773 mnt_drop_write_file(file); 1774 out: 1775 return ret; 1776 } 1777 1778 static noinline int btrfs_ioctl_snap_create(struct file *file, 1779 void __user *arg, int subvol) 1780 { 1781 struct btrfs_ioctl_vol_args *vol_args; 1782 int ret; 1783 1784 if (!S_ISDIR(file_inode(file)->i_mode)) 1785 return -ENOTDIR; 1786 1787 vol_args = memdup_user(arg, sizeof(*vol_args)); 1788 if (IS_ERR(vol_args)) 1789 return PTR_ERR(vol_args); 1790 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1791 1792 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1793 vol_args->fd, subvol, 1794 NULL, false, NULL); 1795 1796 kfree(vol_args); 1797 return ret; 1798 } 1799 1800 static noinline int btrfs_ioctl_snap_create_v2(struct file *file, 1801 void __user *arg, int subvol) 1802 { 1803 struct btrfs_ioctl_vol_args_v2 *vol_args; 1804 int ret; 1805 u64 transid = 0; 1806 u64 *ptr = NULL; 1807 bool readonly = false; 1808 struct btrfs_qgroup_inherit *inherit = NULL; 1809 1810 if (!S_ISDIR(file_inode(file)->i_mode)) 1811 return -ENOTDIR; 1812 1813 vol_args = memdup_user(arg, 
sizeof(*vol_args)); 1814 if (IS_ERR(vol_args)) 1815 return PTR_ERR(vol_args); 1816 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 1817 1818 if (vol_args->flags & 1819 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1820 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1821 ret = -EOPNOTSUPP; 1822 goto free_args; 1823 } 1824 1825 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1826 ptr = &transid; 1827 if (vol_args->flags & BTRFS_SUBVOL_RDONLY) 1828 readonly = true; 1829 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1830 if (vol_args->size > PAGE_SIZE) { 1831 ret = -EINVAL; 1832 goto free_args; 1833 } 1834 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1835 if (IS_ERR(inherit)) { 1836 ret = PTR_ERR(inherit); 1837 goto free_args; 1838 } 1839 } 1840 1841 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1842 vol_args->fd, subvol, ptr, 1843 readonly, inherit); 1844 if (ret) 1845 goto free_inherit; 1846 1847 if (ptr && copy_to_user(arg + 1848 offsetof(struct btrfs_ioctl_vol_args_v2, 1849 transid), 1850 ptr, sizeof(*ptr))) 1851 ret = -EFAULT; 1852 1853 free_inherit: 1854 kfree(inherit); 1855 free_args: 1856 kfree(vol_args); 1857 return ret; 1858 } 1859 1860 static noinline int btrfs_ioctl_subvol_getflags(struct file *file, 1861 void __user *arg) 1862 { 1863 struct inode *inode = file_inode(file); 1864 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1865 struct btrfs_root *root = BTRFS_I(inode)->root; 1866 int ret = 0; 1867 u64 flags = 0; 1868 1869 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) 1870 return -EINVAL; 1871 1872 down_read(&fs_info->subvol_sem); 1873 if (btrfs_root_readonly(root)) 1874 flags |= BTRFS_SUBVOL_RDONLY; 1875 up_read(&fs_info->subvol_sem); 1876 1877 if (copy_to_user(arg, &flags, sizeof(flags))) 1878 ret = -EFAULT; 1879 1880 return ret; 1881 } 1882 1883 static noinline int btrfs_ioctl_subvol_setflags(struct file *file, 1884 void __user *arg) 1885 { 1886 struct inode *inode = file_inode(file); 1887 struct 
btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1888 struct btrfs_root *root = BTRFS_I(inode)->root; 1889 struct btrfs_trans_handle *trans; 1890 u64 root_flags; 1891 u64 flags; 1892 int ret = 0; 1893 1894 if (!inode_owner_or_capable(inode)) 1895 return -EPERM; 1896 1897 ret = mnt_want_write_file(file); 1898 if (ret) 1899 goto out; 1900 1901 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 1902 ret = -EINVAL; 1903 goto out_drop_write; 1904 } 1905 1906 if (copy_from_user(&flags, arg, sizeof(flags))) { 1907 ret = -EFAULT; 1908 goto out_drop_write; 1909 } 1910 1911 if (flags & BTRFS_SUBVOL_CREATE_ASYNC) { 1912 ret = -EINVAL; 1913 goto out_drop_write; 1914 } 1915 1916 if (flags & ~BTRFS_SUBVOL_RDONLY) { 1917 ret = -EOPNOTSUPP; 1918 goto out_drop_write; 1919 } 1920 1921 down_write(&fs_info->subvol_sem); 1922 1923 /* nothing to do */ 1924 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) 1925 goto out_drop_sem; 1926 1927 root_flags = btrfs_root_flags(&root->root_item); 1928 if (flags & BTRFS_SUBVOL_RDONLY) { 1929 btrfs_set_root_flags(&root->root_item, 1930 root_flags | BTRFS_ROOT_SUBVOL_RDONLY); 1931 } else { 1932 /* 1933 * Block RO -> RW transition if this subvolume is involved in 1934 * send 1935 */ 1936 spin_lock(&root->root_item_lock); 1937 if (root->send_in_progress == 0) { 1938 btrfs_set_root_flags(&root->root_item, 1939 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); 1940 spin_unlock(&root->root_item_lock); 1941 } else { 1942 spin_unlock(&root->root_item_lock); 1943 btrfs_warn(fs_info, 1944 "Attempt to set subvolume %llu read-write during send", 1945 root->root_key.objectid); 1946 ret = -EPERM; 1947 goto out_drop_sem; 1948 } 1949 } 1950 1951 trans = btrfs_start_transaction(root, 1); 1952 if (IS_ERR(trans)) { 1953 ret = PTR_ERR(trans); 1954 goto out_reset; 1955 } 1956 1957 ret = btrfs_update_root(trans, fs_info->tree_root, 1958 &root->root_key, &root->root_item); 1959 if (ret < 0) { 1960 btrfs_end_transaction(trans); 1961 goto out_reset; 1962 } 
1963 1964 ret = btrfs_commit_transaction(trans); 1965 1966 out_reset: 1967 if (ret) 1968 btrfs_set_root_flags(&root->root_item, root_flags); 1969 out_drop_sem: 1970 up_write(&fs_info->subvol_sem); 1971 out_drop_write: 1972 mnt_drop_write_file(file); 1973 out: 1974 return ret; 1975 } 1976 1977 static noinline int key_in_sk(struct btrfs_key *key, 1978 struct btrfs_ioctl_search_key *sk) 1979 { 1980 struct btrfs_key test; 1981 int ret; 1982 1983 test.objectid = sk->min_objectid; 1984 test.type = sk->min_type; 1985 test.offset = sk->min_offset; 1986 1987 ret = btrfs_comp_cpu_keys(key, &test); 1988 if (ret < 0) 1989 return 0; 1990 1991 test.objectid = sk->max_objectid; 1992 test.type = sk->max_type; 1993 test.offset = sk->max_offset; 1994 1995 ret = btrfs_comp_cpu_keys(key, &test); 1996 if (ret > 0) 1997 return 0; 1998 return 1; 1999 } 2000 2001 static noinline int copy_to_sk(struct btrfs_path *path, 2002 struct btrfs_key *key, 2003 struct btrfs_ioctl_search_key *sk, 2004 size_t *buf_size, 2005 char __user *ubuf, 2006 unsigned long *sk_offset, 2007 int *num_found) 2008 { 2009 u64 found_transid; 2010 struct extent_buffer *leaf; 2011 struct btrfs_ioctl_search_header sh; 2012 struct btrfs_key test; 2013 unsigned long item_off; 2014 unsigned long item_len; 2015 int nritems; 2016 int i; 2017 int slot; 2018 int ret = 0; 2019 2020 leaf = path->nodes[0]; 2021 slot = path->slots[0]; 2022 nritems = btrfs_header_nritems(leaf); 2023 2024 if (btrfs_header_generation(leaf) > sk->max_transid) { 2025 i = nritems; 2026 goto advance_key; 2027 } 2028 found_transid = btrfs_header_generation(leaf); 2029 2030 for (i = slot; i < nritems; i++) { 2031 item_off = btrfs_item_ptr_offset(leaf, i); 2032 item_len = btrfs_item_size_nr(leaf, i); 2033 2034 btrfs_item_key_to_cpu(leaf, key, i); 2035 if (!key_in_sk(key, sk)) 2036 continue; 2037 2038 if (sizeof(sh) + item_len > *buf_size) { 2039 if (*num_found) { 2040 ret = 1; 2041 goto out; 2042 } 2043 2044 /* 2045 * return one empty item back for v1, which 
does not 2046 * handle -EOVERFLOW 2047 */ 2048 2049 *buf_size = sizeof(sh) + item_len; 2050 item_len = 0; 2051 ret = -EOVERFLOW; 2052 } 2053 2054 if (sizeof(sh) + item_len + *sk_offset > *buf_size) { 2055 ret = 1; 2056 goto out; 2057 } 2058 2059 sh.objectid = key->objectid; 2060 sh.offset = key->offset; 2061 sh.type = key->type; 2062 sh.len = item_len; 2063 sh.transid = found_transid; 2064 2065 /* copy search result header */ 2066 if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { 2067 ret = -EFAULT; 2068 goto out; 2069 } 2070 2071 *sk_offset += sizeof(sh); 2072 2073 if (item_len) { 2074 char __user *up = ubuf + *sk_offset; 2075 /* copy the item */ 2076 if (read_extent_buffer_to_user(leaf, up, 2077 item_off, item_len)) { 2078 ret = -EFAULT; 2079 goto out; 2080 } 2081 2082 *sk_offset += item_len; 2083 } 2084 (*num_found)++; 2085 2086 if (ret) /* -EOVERFLOW from above */ 2087 goto out; 2088 2089 if (*num_found >= sk->nr_items) { 2090 ret = 1; 2091 goto out; 2092 } 2093 } 2094 advance_key: 2095 ret = 0; 2096 test.objectid = sk->max_objectid; 2097 test.type = sk->max_type; 2098 test.offset = sk->max_offset; 2099 if (btrfs_comp_cpu_keys(key, &test) >= 0) 2100 ret = 1; 2101 else if (key->offset < (u64)-1) 2102 key->offset++; 2103 else if (key->type < (u8)-1) { 2104 key->offset = 0; 2105 key->type++; 2106 } else if (key->objectid < (u64)-1) { 2107 key->offset = 0; 2108 key->type = 0; 2109 key->objectid++; 2110 } else 2111 ret = 1; 2112 out: 2113 /* 2114 * 0: all items from this leaf copied, continue with next 2115 * 1: * more items can be copied, but unused buffer is too small 2116 * * all items were found 2117 * Either way, it will stops the loop which iterates to the next 2118 * leaf 2119 * -EOVERFLOW: item was to large for buffer 2120 * -EFAULT: could not copy extent buffer back to userspace 2121 */ 2122 return ret; 2123 } 2124 2125 static noinline int search_ioctl(struct inode *inode, 2126 struct btrfs_ioctl_search_key *sk, 2127 size_t *buf_size, 2128 char __user 
*ubuf) 2129 { 2130 struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); 2131 struct btrfs_root *root; 2132 struct btrfs_key key; 2133 struct btrfs_path *path; 2134 int ret; 2135 int num_found = 0; 2136 unsigned long sk_offset = 0; 2137 2138 if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { 2139 *buf_size = sizeof(struct btrfs_ioctl_search_header); 2140 return -EOVERFLOW; 2141 } 2142 2143 path = btrfs_alloc_path(); 2144 if (!path) 2145 return -ENOMEM; 2146 2147 if (sk->tree_id == 0) { 2148 /* search the root of the inode that was passed */ 2149 root = BTRFS_I(inode)->root; 2150 } else { 2151 key.objectid = sk->tree_id; 2152 key.type = BTRFS_ROOT_ITEM_KEY; 2153 key.offset = (u64)-1; 2154 root = btrfs_read_fs_root_no_name(info, &key); 2155 if (IS_ERR(root)) { 2156 btrfs_free_path(path); 2157 return PTR_ERR(root); 2158 } 2159 } 2160 2161 key.objectid = sk->min_objectid; 2162 key.type = sk->min_type; 2163 key.offset = sk->min_offset; 2164 2165 while (1) { 2166 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2167 if (ret != 0) { 2168 if (ret > 0) 2169 ret = 0; 2170 goto err; 2171 } 2172 ret = copy_to_sk(path, &key, sk, buf_size, ubuf, 2173 &sk_offset, &num_found); 2174 btrfs_release_path(path); 2175 if (ret) 2176 break; 2177 2178 } 2179 if (ret > 0) 2180 ret = 0; 2181 err: 2182 sk->nr_items = num_found; 2183 btrfs_free_path(path); 2184 return ret; 2185 } 2186 2187 static noinline int btrfs_ioctl_tree_search(struct file *file, 2188 void __user *argp) 2189 { 2190 struct btrfs_ioctl_search_args __user *uargs; 2191 struct btrfs_ioctl_search_key sk; 2192 struct inode *inode; 2193 int ret; 2194 size_t buf_size; 2195 2196 if (!capable(CAP_SYS_ADMIN)) 2197 return -EPERM; 2198 2199 uargs = (struct btrfs_ioctl_search_args __user *)argp; 2200 2201 if (copy_from_user(&sk, &uargs->key, sizeof(sk))) 2202 return -EFAULT; 2203 2204 buf_size = sizeof(uargs->buf); 2205 2206 inode = file_inode(file); 2207 ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); 2208 
2209 /* 2210 * In the origin implementation an overflow is handled by returning a 2211 * search header with a len of zero, so reset ret. 2212 */ 2213 if (ret == -EOVERFLOW) 2214 ret = 0; 2215 2216 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) 2217 ret = -EFAULT; 2218 return ret; 2219 } 2220 2221 static noinline int btrfs_ioctl_tree_search_v2(struct file *file, 2222 void __user *argp) 2223 { 2224 struct btrfs_ioctl_search_args_v2 __user *uarg; 2225 struct btrfs_ioctl_search_args_v2 args; 2226 struct inode *inode; 2227 int ret; 2228 size_t buf_size; 2229 const size_t buf_limit = SZ_16M; 2230 2231 if (!capable(CAP_SYS_ADMIN)) 2232 return -EPERM; 2233 2234 /* copy search header and buffer size */ 2235 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; 2236 if (copy_from_user(&args, uarg, sizeof(args))) 2237 return -EFAULT; 2238 2239 buf_size = args.buf_size; 2240 2241 /* limit result size to 16MB */ 2242 if (buf_size > buf_limit) 2243 buf_size = buf_limit; 2244 2245 inode = file_inode(file); 2246 ret = search_ioctl(inode, &args.key, &buf_size, 2247 (char __user *)(&uarg->buf[0])); 2248 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) 2249 ret = -EFAULT; 2250 else if (ret == -EOVERFLOW && 2251 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) 2252 ret = -EFAULT; 2253 2254 return ret; 2255 } 2256 2257 /* 2258 * Search INODE_REFs to identify path name of 'dirid' directory 2259 * in a 'tree_id' tree. and sets path name to 'name'. 
2260 */ 2261 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, 2262 u64 tree_id, u64 dirid, char *name) 2263 { 2264 struct btrfs_root *root; 2265 struct btrfs_key key; 2266 char *ptr; 2267 int ret = -1; 2268 int slot; 2269 int len; 2270 int total_len = 0; 2271 struct btrfs_inode_ref *iref; 2272 struct extent_buffer *l; 2273 struct btrfs_path *path; 2274 2275 if (dirid == BTRFS_FIRST_FREE_OBJECTID) { 2276 name[0]='\0'; 2277 return 0; 2278 } 2279 2280 path = btrfs_alloc_path(); 2281 if (!path) 2282 return -ENOMEM; 2283 2284 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; 2285 2286 key.objectid = tree_id; 2287 key.type = BTRFS_ROOT_ITEM_KEY; 2288 key.offset = (u64)-1; 2289 root = btrfs_read_fs_root_no_name(info, &key); 2290 if (IS_ERR(root)) { 2291 ret = PTR_ERR(root); 2292 goto out; 2293 } 2294 2295 key.objectid = dirid; 2296 key.type = BTRFS_INODE_REF_KEY; 2297 key.offset = (u64)-1; 2298 2299 while (1) { 2300 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2301 if (ret < 0) 2302 goto out; 2303 else if (ret > 0) { 2304 ret = btrfs_previous_item(root, path, dirid, 2305 BTRFS_INODE_REF_KEY); 2306 if (ret < 0) 2307 goto out; 2308 else if (ret > 0) { 2309 ret = -ENOENT; 2310 goto out; 2311 } 2312 } 2313 2314 l = path->nodes[0]; 2315 slot = path->slots[0]; 2316 btrfs_item_key_to_cpu(l, &key, slot); 2317 2318 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); 2319 len = btrfs_inode_ref_name_len(l, iref); 2320 ptr -= len + 1; 2321 total_len += len + 1; 2322 if (ptr < name) { 2323 ret = -ENAMETOOLONG; 2324 goto out; 2325 } 2326 2327 *(ptr + len) = '/'; 2328 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); 2329 2330 if (key.offset == BTRFS_FIRST_FREE_OBJECTID) 2331 break; 2332 2333 btrfs_release_path(path); 2334 key.objectid = key.offset; 2335 key.offset = (u64)-1; 2336 dirid = key.objectid; 2337 } 2338 memmove(name, ptr, total_len); 2339 name[total_len] = '\0'; 2340 ret = 0; 2341 out: 2342 btrfs_free_path(path); 2343 return ret; 
2344 } 2345 2346 static int btrfs_search_path_in_tree_user(struct inode *inode, 2347 struct btrfs_ioctl_ino_lookup_user_args *args) 2348 { 2349 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2350 struct super_block *sb = inode->i_sb; 2351 struct btrfs_key upper_limit = BTRFS_I(inode)->location; 2352 u64 treeid = BTRFS_I(inode)->root->root_key.objectid; 2353 u64 dirid = args->dirid; 2354 unsigned long item_off; 2355 unsigned long item_len; 2356 struct btrfs_inode_ref *iref; 2357 struct btrfs_root_ref *rref; 2358 struct btrfs_root *root; 2359 struct btrfs_path *path; 2360 struct btrfs_key key, key2; 2361 struct extent_buffer *leaf; 2362 struct inode *temp_inode; 2363 char *ptr; 2364 int slot; 2365 int len; 2366 int total_len = 0; 2367 int ret; 2368 2369 path = btrfs_alloc_path(); 2370 if (!path) 2371 return -ENOMEM; 2372 2373 /* 2374 * If the bottom subvolume does not exist directly under upper_limit, 2375 * construct the path in from the bottom up. 2376 */ 2377 if (dirid != upper_limit.objectid) { 2378 ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; 2379 2380 key.objectid = treeid; 2381 key.type = BTRFS_ROOT_ITEM_KEY; 2382 key.offset = (u64)-1; 2383 root = btrfs_read_fs_root_no_name(fs_info, &key); 2384 if (IS_ERR(root)) { 2385 ret = PTR_ERR(root); 2386 goto out; 2387 } 2388 2389 key.objectid = dirid; 2390 key.type = BTRFS_INODE_REF_KEY; 2391 key.offset = (u64)-1; 2392 while (1) { 2393 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2394 if (ret < 0) { 2395 goto out; 2396 } else if (ret > 0) { 2397 ret = btrfs_previous_item(root, path, dirid, 2398 BTRFS_INODE_REF_KEY); 2399 if (ret < 0) { 2400 goto out; 2401 } else if (ret > 0) { 2402 ret = -ENOENT; 2403 goto out; 2404 } 2405 } 2406 2407 leaf = path->nodes[0]; 2408 slot = path->slots[0]; 2409 btrfs_item_key_to_cpu(leaf, &key, slot); 2410 2411 iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); 2412 len = btrfs_inode_ref_name_len(leaf, iref); 2413 ptr -= len + 1; 2414 total_len += 
len + 1; 2415 if (ptr < args->path) { 2416 ret = -ENAMETOOLONG; 2417 goto out; 2418 } 2419 2420 *(ptr + len) = '/'; 2421 read_extent_buffer(leaf, ptr, 2422 (unsigned long)(iref + 1), len); 2423 2424 /* Check the read+exec permission of this directory */ 2425 ret = btrfs_previous_item(root, path, dirid, 2426 BTRFS_INODE_ITEM_KEY); 2427 if (ret < 0) { 2428 goto out; 2429 } else if (ret > 0) { 2430 ret = -ENOENT; 2431 goto out; 2432 } 2433 2434 leaf = path->nodes[0]; 2435 slot = path->slots[0]; 2436 btrfs_item_key_to_cpu(leaf, &key2, slot); 2437 if (key2.objectid != dirid) { 2438 ret = -ENOENT; 2439 goto out; 2440 } 2441 2442 temp_inode = btrfs_iget(sb, &key2, root, NULL); 2443 if (IS_ERR(temp_inode)) { 2444 ret = PTR_ERR(temp_inode); 2445 goto out; 2446 } 2447 ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); 2448 iput(temp_inode); 2449 if (ret) { 2450 ret = -EACCES; 2451 goto out; 2452 } 2453 2454 if (key.offset == upper_limit.objectid) 2455 break; 2456 if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { 2457 ret = -EACCES; 2458 goto out; 2459 } 2460 2461 btrfs_release_path(path); 2462 key.objectid = key.offset; 2463 key.offset = (u64)-1; 2464 dirid = key.objectid; 2465 } 2466 2467 memmove(args->path, ptr, total_len); 2468 args->path[total_len] = '\0'; 2469 btrfs_release_path(path); 2470 } 2471 2472 /* Get the bottom subvolume's name from ROOT_REF */ 2473 root = fs_info->tree_root; 2474 key.objectid = treeid; 2475 key.type = BTRFS_ROOT_REF_KEY; 2476 key.offset = args->treeid; 2477 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2478 if (ret < 0) { 2479 goto out; 2480 } else if (ret > 0) { 2481 ret = -ENOENT; 2482 goto out; 2483 } 2484 2485 leaf = path->nodes[0]; 2486 slot = path->slots[0]; 2487 btrfs_item_key_to_cpu(leaf, &key, slot); 2488 2489 item_off = btrfs_item_ptr_offset(leaf, slot); 2490 item_len = btrfs_item_size_nr(leaf, slot); 2491 /* Check if dirid in ROOT_REF corresponds to passed dirid */ 2492 rref = btrfs_item_ptr(leaf, slot, struct 
btrfs_root_ref); 2493 if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { 2494 ret = -EINVAL; 2495 goto out; 2496 } 2497 2498 /* Copy subvolume's name */ 2499 item_off += sizeof(struct btrfs_root_ref); 2500 item_len -= sizeof(struct btrfs_root_ref); 2501 read_extent_buffer(leaf, args->name, item_off, item_len); 2502 args->name[item_len] = 0; 2503 2504 out: 2505 btrfs_free_path(path); 2506 return ret; 2507 } 2508 2509 static noinline int btrfs_ioctl_ino_lookup(struct file *file, 2510 void __user *argp) 2511 { 2512 struct btrfs_ioctl_ino_lookup_args *args; 2513 struct inode *inode; 2514 int ret = 0; 2515 2516 args = memdup_user(argp, sizeof(*args)); 2517 if (IS_ERR(args)) 2518 return PTR_ERR(args); 2519 2520 inode = file_inode(file); 2521 2522 /* 2523 * Unprivileged query to obtain the containing subvolume root id. The 2524 * path is reset so it's consistent with btrfs_search_path_in_tree. 2525 */ 2526 if (args->treeid == 0) 2527 args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2528 2529 if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { 2530 args->name[0] = 0; 2531 goto out; 2532 } 2533 2534 if (!capable(CAP_SYS_ADMIN)) { 2535 ret = -EPERM; 2536 goto out; 2537 } 2538 2539 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2540 args->treeid, args->objectid, 2541 args->name); 2542 2543 out: 2544 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2545 ret = -EFAULT; 2546 2547 kfree(args); 2548 return ret; 2549 } 2550 2551 /* 2552 * Version of ino_lookup ioctl (unprivileged) 2553 * 2554 * The main differences from ino_lookup ioctl are: 2555 * 2556 * 1. Read + Exec permission will be checked using inode_permission() during 2557 * path construction. -EACCES will be returned in case of failure. 2558 * 2. Path construction will be stopped at the inode number which corresponds 2559 * to the fd with which this ioctl is called. If constructed path does not 2560 * exist under fd's inode, -EACCES will be returned. 2561 * 3. 
The name of bottom subvolume is also searched and filled. 2562 */ 2563 static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) 2564 { 2565 struct btrfs_ioctl_ino_lookup_user_args *args; 2566 struct inode *inode; 2567 int ret; 2568 2569 args = memdup_user(argp, sizeof(*args)); 2570 if (IS_ERR(args)) 2571 return PTR_ERR(args); 2572 2573 inode = file_inode(file); 2574 2575 if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && 2576 BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { 2577 /* 2578 * The subvolume does not exist under fd with which this is 2579 * called 2580 */ 2581 kfree(args); 2582 return -EACCES; 2583 } 2584 2585 ret = btrfs_search_path_in_tree_user(inode, args); 2586 2587 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 2588 ret = -EFAULT; 2589 2590 kfree(args); 2591 return ret; 2592 } 2593 2594 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ 2595 static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) 2596 { 2597 struct btrfs_ioctl_get_subvol_info_args *subvol_info; 2598 struct btrfs_fs_info *fs_info; 2599 struct btrfs_root *root; 2600 struct btrfs_path *path; 2601 struct btrfs_key key; 2602 struct btrfs_root_item *root_item; 2603 struct btrfs_root_ref *rref; 2604 struct extent_buffer *leaf; 2605 unsigned long item_off; 2606 unsigned long item_len; 2607 struct inode *inode; 2608 int slot; 2609 int ret = 0; 2610 2611 path = btrfs_alloc_path(); 2612 if (!path) 2613 return -ENOMEM; 2614 2615 subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); 2616 if (!subvol_info) { 2617 btrfs_free_path(path); 2618 return -ENOMEM; 2619 } 2620 2621 inode = file_inode(file); 2622 fs_info = BTRFS_I(inode)->root->fs_info; 2623 2624 /* Get root_item of inode's subvolume */ 2625 key.objectid = BTRFS_I(inode)->root->root_key.objectid; 2626 key.type = BTRFS_ROOT_ITEM_KEY; 2627 key.offset = (u64)-1; 2628 root = btrfs_read_fs_root_no_name(fs_info, &key); 2629 if (IS_ERR(root)) { 2630 ret 
= PTR_ERR(root); 2631 goto out; 2632 } 2633 root_item = &root->root_item; 2634 2635 subvol_info->treeid = key.objectid; 2636 2637 subvol_info->generation = btrfs_root_generation(root_item); 2638 subvol_info->flags = btrfs_root_flags(root_item); 2639 2640 memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); 2641 memcpy(subvol_info->parent_uuid, root_item->parent_uuid, 2642 BTRFS_UUID_SIZE); 2643 memcpy(subvol_info->received_uuid, root_item->received_uuid, 2644 BTRFS_UUID_SIZE); 2645 2646 subvol_info->ctransid = btrfs_root_ctransid(root_item); 2647 subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); 2648 subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); 2649 2650 subvol_info->otransid = btrfs_root_otransid(root_item); 2651 subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); 2652 subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); 2653 2654 subvol_info->stransid = btrfs_root_stransid(root_item); 2655 subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); 2656 subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); 2657 2658 subvol_info->rtransid = btrfs_root_rtransid(root_item); 2659 subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); 2660 subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); 2661 2662 if (key.objectid != BTRFS_FS_TREE_OBJECTID) { 2663 /* Search root tree for ROOT_BACKREF of this subvolume */ 2664 root = fs_info->tree_root; 2665 2666 key.type = BTRFS_ROOT_BACKREF_KEY; 2667 key.offset = 0; 2668 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2669 if (ret < 0) { 2670 goto out; 2671 } else if (path->slots[0] >= 2672 btrfs_header_nritems(path->nodes[0])) { 2673 ret = btrfs_next_leaf(root, path); 2674 if (ret < 0) { 2675 goto out; 2676 } else if (ret > 0) { 2677 ret = -EUCLEAN; 2678 goto out; 2679 } 2680 } 2681 2682 leaf = path->nodes[0]; 2683 slot = path->slots[0]; 2684 btrfs_item_key_to_cpu(leaf, 
&key, slot); 2685 if (key.objectid == subvol_info->treeid && 2686 key.type == BTRFS_ROOT_BACKREF_KEY) { 2687 subvol_info->parent_id = key.offset; 2688 2689 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); 2690 subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); 2691 2692 item_off = btrfs_item_ptr_offset(leaf, slot) 2693 + sizeof(struct btrfs_root_ref); 2694 item_len = btrfs_item_size_nr(leaf, slot) 2695 - sizeof(struct btrfs_root_ref); 2696 read_extent_buffer(leaf, subvol_info->name, 2697 item_off, item_len); 2698 } else { 2699 ret = -ENOENT; 2700 goto out; 2701 } 2702 } 2703 2704 if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) 2705 ret = -EFAULT; 2706 2707 out: 2708 btrfs_free_path(path); 2709 kzfree(subvol_info); 2710 return ret; 2711 } 2712 2713 /* 2714 * Return ROOT_REF information of the subvolume containing this inode 2715 * except the subvolume name. 2716 */ 2717 static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) 2718 { 2719 struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; 2720 struct btrfs_root_ref *rref; 2721 struct btrfs_root *root; 2722 struct btrfs_path *path; 2723 struct btrfs_key key; 2724 struct extent_buffer *leaf; 2725 struct inode *inode; 2726 u64 objectid; 2727 int slot; 2728 int ret; 2729 u8 found; 2730 2731 path = btrfs_alloc_path(); 2732 if (!path) 2733 return -ENOMEM; 2734 2735 rootrefs = memdup_user(argp, sizeof(*rootrefs)); 2736 if (IS_ERR(rootrefs)) { 2737 btrfs_free_path(path); 2738 return PTR_ERR(rootrefs); 2739 } 2740 2741 inode = file_inode(file); 2742 root = BTRFS_I(inode)->root->fs_info->tree_root; 2743 objectid = BTRFS_I(inode)->root->root_key.objectid; 2744 2745 key.objectid = objectid; 2746 key.type = BTRFS_ROOT_REF_KEY; 2747 key.offset = rootrefs->min_treeid; 2748 found = 0; 2749 2750 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2751 if (ret < 0) { 2752 goto out; 2753 } else if (path->slots[0] >= 2754 btrfs_header_nritems(path->nodes[0])) { 2755 ret = 
btrfs_next_leaf(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* Stop at the first item that is not a ROOT_REF of ours. */
		if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
			ret = 0;
			goto out;
		}

		/* Result buffer is full; the caller can continue from min_treeid. */
		if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
			ret = -EOVERFLOW;
			goto out;
		}

		rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
		rootrefs->rootref[found].treeid = key.offset;
		rootrefs->rootref[found].dirid =
				  btrfs_root_ref_dirid(leaf, rref);
		found++;

		ret = btrfs_next_item(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	/* Partial results are also reported when the buffer overflowed. */
	if (!ret || ret == -EOVERFLOW) {
		rootrefs->num_items = found;
		/* update min_treeid for next search */
		if (found)
			rootrefs->min_treeid =
				rootrefs->rootref[found - 1].treeid + 1;
		if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
			ret = -EFAULT;
	}

	kfree(rootrefs);
	btrfs_free_path(path);

	return ret;
}

/*
 * Handler for the snapshot/subvolume destroy ioctl.
 *
 * @file must refer to a directory; the name supplied in the user's
 * btrfs_ioctl_vol_args is looked up inside that directory and the subvolume
 * it refers to is deleted via btrfs_delete_subvolume().
 *
 * Returns 0 on success or a negative errno: -ENOTDIR, -EINVAL, -ENOENT,
 * -EPERM, or an error from the helpers called along the way.
 */
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
					     void __user *arg)
{
	struct dentry *parent = file->f_path.dentry;
	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
	struct dentry *dentry;
	struct inode *dir = d_inode(parent);
	struct inode *inode;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *dest = NULL;
	struct btrfs_ioctl_vol_args *vol_args;
	int namelen;
	int err = 0;

	if (!S_ISDIR(dir->i_mode))
		return -ENOTDIR;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);

	/* Force termination of the user-supplied name before using it. */
	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
	namelen = strlen(vol_args->name);
	/*
	 * Reject names containing '/' and names that are a prefix of ".."
	 * (the comparison covers only the first namelen bytes).
	 */
	if (strchr(vol_args->name, '/') ||
	    strncmp(vol_args->name, "..", namelen) == 0) {
		err = -EINVAL;
		goto out;
	}

	err = mnt_want_write_file(file);
	if (err)
		goto out;


	/* Lock the parent directory; the wait can be interrupted by a kill. */
	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
	if (err == -EINTR)
		goto out_drop_write;
	dentry = lookup_one_len(vol_args->name, parent, namelen);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out_unlock_dir;
	}

	if (d_really_is_negative(dentry)) {
		err = -ENOENT;
		goto out_dput;
	}

	inode = d_inode(dentry);
	dest = BTRFS_I(inode)->root;
	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * Regular user.  Only allow this with a special mount
		 * option, when the user has write+exec access to the
		 * subvol root, and when rmdir(2) would have been
		 * allowed.
		 *
		 * Note that this is _not_ check that the subvol is
		 * empty or doesn't contain data that we wouldn't
		 * otherwise be able to delete.
		 *
		 * Users who want to delete empty subvols should try
		 * rmdir(2).
		 */
		err = -EPERM;
		if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
			goto out_dput;

		/*
		 * Do not allow deletion if the parent dir is the same
		 * as the dir to be deleted.  That means the ioctl
		 * must be called on the dentry referencing the root
		 * of the subvol, not a random directory contained
		 * within it.
		 */
		err = -EINVAL;
		if (root == dest)
			goto out_dput;

		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
		if (err)
			goto out_dput;
	}

	/* check if subvolume may be deleted by a user */
	err = btrfs_may_delete(dir, dentry, 1);
	if (err)
		goto out_dput;

	/* The dentry must point at the top directory of a subvolume. */
	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		err = -EINVAL;
		goto out_dput;
	}

	inode_lock(inode);
	err = btrfs_delete_subvolume(dir, dentry);
	inode_unlock(inode);
	if (!err)
		d_delete(dentry);

	/* Unwind in the reverse order of acquisition. */
out_dput:
	dput(dentry);
out_unlock_dir:
	inode_unlock(dir);
out_drop_write:
	mnt_drop_write_file(file);
out:
	kfree(vol_args);
	return err;
}

/*
 * Handler for the defrag ioctls.
 *
 * For a directory the whole root is defragmented (CAP_SYS_ADMIN only); for a
 * regular file the range described by the optional user argument is
 * defragmented. Any other file type yields -EINVAL, and a read-only root
 * yields -EROFS.
 */
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ioctl_defrag_range_args *range;
	int ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (btrfs_root_readonly(root)) {
		ret = -EROFS;
		goto out;
	}

	switch (inode->i_mode & S_IFMT) {
	case S_IFDIR:
		if (!capable(CAP_SYS_ADMIN)) {
			ret = -EPERM;
			goto out;
		}
		ret = btrfs_defrag_root(root);
		break;
	case S_IFREG:
		/*
		 * Note that this does not check the file descriptor for write
		 * access. This prevents defragmenting executables that are
		 * running and allows defrag on files open in read-only mode.
*/
		if (!capable(CAP_SYS_ADMIN) &&
		    inode_permission(inode, MAY_WRITE)) {
			ret = -EPERM;
			goto out;
		}

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto out;
		}

		if (argp) {
			if (copy_from_user(range, argp,
					   sizeof(*range))) {
				ret = -EFAULT;
				kfree(range);
				goto out;
			}
			/* compression requires us to start the IO */
			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
				range->extent_thresh = (u32)-1;
			}
		} else {
			/* the rest are all set to zero by kzalloc */
			range->len = (u64)-1;
		}
		ret = btrfs_defrag_file(file_inode(file), file,
					range, BTRFS_OLDEST_GENERATION, 0);
		/* A positive result is reported to the caller as success. */
		if (ret > 0)
			ret = 0;
		kfree(range);
		break;
	default:
		ret = -EINVAL;
	}
out:
	mnt_drop_write_file(file);
	return ret;
}

/*
 * Handler for the add-device ioctl: add the device named in the user's
 * btrfs_ioctl_vol_args to the filesystem.
 *
 * Requires CAP_SYS_ADMIN. The BTRFS_FS_EXCL_OP bit is held for the duration
 * of the operation; if it is already set,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS is returned instead.
 */
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
	struct btrfs_ioctl_vol_args *vol_args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args)) {
		ret = PTR_ERR(vol_args);
		goto out;
	}

	/* Force termination of the user-supplied device path. */
	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
	ret = btrfs_init_new_device(fs_info, vol_args->name);

	if (!ret)
		btrfs_info(fs_info, "disk added %s", vol_args->name);

	kfree(vol_args);
out:
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	return ret;
}

/*
 * Handler for the v2 remove-device ioctl: the device is selected either by
 * devid (BTRFS_DEVICE_SPEC_BY_ID set in ->flags) or by name.
 *
 * Requires CAP_SYS_ADMIN. Unknown flags yield -EOPNOTSUPP, and an exclusive
 * operation already in progress yields BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS.
 */
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ioctl_vol_args_v2 *vol_args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args)) {
		ret = PTR_ERR(vol_args);
		goto err_drop;
	}

	/* Check for compatibility reject unknown flags */
	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out;
	}

	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
		ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
	} else {
		/* Force termination of the user-supplied device path. */
		vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
		ret = btrfs_rm_device(fs_info, vol_args->name, 0);
	}
	/* The bit is only cleared on paths that set it above. */
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

	if (!ret) {
		if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
			btrfs_info(fs_info, "device deleted: id %llu",
					vol_args->devid);
		else
			btrfs_info(fs_info, "device deleted: %s",
					vol_args->name);
	}
out:
	kfree(vol_args);
err_drop:
	mnt_drop_write_file(file);
	return ret;
}

/*
 * Handler for the original remove-device ioctl: remove the device named in
 * the user's btrfs_ioctl_vol_args.
 *
 * Requires CAP_SYS_ADMIN. Unlike the v2 variant, the BTRFS_FS_EXCL_OP bit is
 * taken before the arguments are copied from userspace, so the copy failure
 * path must clear it again.
 */
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ioctl_vol_args *vol_args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out_drop_write;
	}

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args)) {
		ret = PTR_ERR(vol_args);
		goto out;
	}

	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
	ret = btrfs_rm_device(fs_info, vol_args->name, 0);

	if (!ret)
		btrfs_info(fs_info, "disk deleted %s", vol_args->name);
	kfree(vol_args);
out:
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
	mnt_drop_write_file(file);

	return ret;
}

/*
 * Handler for the fs-info ioctl: report the device count, the highest devid,
 * the fsid, and the node/sector/clone-alignment sizes to userspace.
 *
 * Returns 0 on success, -ENOMEM, or -EFAULT when the copy out fails.
 */
static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
				void __user *arg)
{
	struct btrfs_ioctl_fs_info_args *fi_args;
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int ret = 0;

	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
	if (!fi_args)
		return -ENOMEM;

	/* The device list is walked under RCU protection. */
	rcu_read_lock();
	fi_args->num_devices = fs_devices->num_devices;

	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->devid > fi_args->max_id)
			fi_args->max_id = device->devid;
	}
	rcu_read_unlock();

	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
	fi_args->nodesize = fs_info->nodesize;
	fi_args->sectorsize = fs_info->sectorsize;
	fi_args->clone_alignment = fs_info->sectorsize;

	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
		ret = -EFAULT;

	kfree(fi_args);
	return ret;
}

/*
 * Handler for the dev-info ioctl: look up a device by devid and, when the
 * supplied uuid is not empty, by uuid, then report its id, sizes, uuid and
 * path to userspace.
 *
 * Returns 0 on success, -ENODEV when no device matches, -EFAULT when the copy
 * out fails, or the error from copying the arguments in.
 */
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
				 void __user *arg)
{
	struct btrfs_ioctl_dev_info_args *di_args;
	struct btrfs_device *dev;
	int ret = 0;
	char *s_uuid = NULL;

	di_args = memdup_user(arg, sizeof(*di_args));
	if (IS_ERR(di_args))
		return PTR_ERR(di_args);

	/* An all-empty uuid means "match on devid only". */
	if (!btrfs_is_empty_uuid(di_args->uuid))
		s_uuid = di_args->uuid;

	rcu_read_lock();
	dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);

	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	di_args->devid = dev->devid;
	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
	memcpy(di_args->uuid, dev->uuid,
sizeof(di_args->uuid)); 3175 if (dev->name) { 3176 strncpy(di_args->path, rcu_str_deref(dev->name), 3177 sizeof(di_args->path) - 1); 3178 di_args->path[sizeof(di_args->path) - 1] = 0; 3179 } else { 3180 di_args->path[0] = '\0'; 3181 } 3182 3183 out: 3184 rcu_read_unlock(); 3185 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 3186 ret = -EFAULT; 3187 3188 kfree(di_args); 3189 return ret; 3190 } 3191 3192 static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) 3193 { 3194 struct page *page; 3195 3196 page = grab_cache_page(inode->i_mapping, index); 3197 if (!page) 3198 return ERR_PTR(-ENOMEM); 3199 3200 if (!PageUptodate(page)) { 3201 int ret; 3202 3203 ret = btrfs_readpage(NULL, page); 3204 if (ret) 3205 return ERR_PTR(ret); 3206 lock_page(page); 3207 if (!PageUptodate(page)) { 3208 unlock_page(page); 3209 put_page(page); 3210 return ERR_PTR(-EIO); 3211 } 3212 if (page->mapping != inode->i_mapping) { 3213 unlock_page(page); 3214 put_page(page); 3215 return ERR_PTR(-EAGAIN); 3216 } 3217 } 3218 3219 return page; 3220 } 3221 3222 static int gather_extent_pages(struct inode *inode, struct page **pages, 3223 int num_pages, u64 off) 3224 { 3225 int i; 3226 pgoff_t index = off >> PAGE_SHIFT; 3227 3228 for (i = 0; i < num_pages; i++) { 3229 again: 3230 pages[i] = extent_same_get_page(inode, index + i); 3231 if (IS_ERR(pages[i])) { 3232 int err = PTR_ERR(pages[i]); 3233 3234 if (err == -EAGAIN) 3235 goto again; 3236 pages[i] = NULL; 3237 return err; 3238 } 3239 } 3240 return 0; 3241 } 3242 3243 static int lock_extent_range(struct inode *inode, u64 off, u64 len, 3244 bool retry_range_locking) 3245 { 3246 /* 3247 * Do any pending delalloc/csum calculations on inode, one way or 3248 * another, and lock file content. 
3249 * The locking order is: 3250 * 3251 * 1) pages 3252 * 2) range in the inode's io tree 3253 */ 3254 while (1) { 3255 struct btrfs_ordered_extent *ordered; 3256 lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 3257 ordered = btrfs_lookup_first_ordered_extent(inode, 3258 off + len - 1); 3259 if ((!ordered || 3260 ordered->file_offset + ordered->len <= off || 3261 ordered->file_offset >= off + len) && 3262 !test_range_bit(&BTRFS_I(inode)->io_tree, off, 3263 off + len - 1, EXTENT_DELALLOC, 0, NULL)) { 3264 if (ordered) 3265 btrfs_put_ordered_extent(ordered); 3266 break; 3267 } 3268 unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); 3269 if (ordered) 3270 btrfs_put_ordered_extent(ordered); 3271 if (!retry_range_locking) 3272 return -EAGAIN; 3273 btrfs_wait_ordered_range(inode, off, len); 3274 } 3275 return 0; 3276 } 3277 3278 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) 3279 { 3280 inode_unlock(inode1); 3281 inode_unlock(inode2); 3282 } 3283 3284 static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) 3285 { 3286 if (inode1 < inode2) 3287 swap(inode1, inode2); 3288 3289 inode_lock_nested(inode1, I_MUTEX_PARENT); 3290 inode_lock_nested(inode2, I_MUTEX_CHILD); 3291 } 3292 3293 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, 3294 struct inode *inode2, u64 loff2, u64 len) 3295 { 3296 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); 3297 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 3298 } 3299 3300 static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, 3301 struct inode *inode2, u64 loff2, u64 len, 3302 bool retry_range_locking) 3303 { 3304 int ret; 3305 3306 if (inode1 < inode2) { 3307 swap(inode1, inode2); 3308 swap(loff1, loff2); 3309 } 3310 ret = lock_extent_range(inode1, loff1, len, retry_range_locking); 3311 if (ret) 3312 return ret; 3313 ret = lock_extent_range(inode2, loff2, len, retry_range_locking); 3314 if 
(ret) 3315 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, 3316 loff1 + len - 1); 3317 return ret; 3318 } 3319 3320 struct cmp_pages { 3321 int num_pages; 3322 struct page **src_pages; 3323 struct page **dst_pages; 3324 }; 3325 3326 static void btrfs_cmp_data_free(struct cmp_pages *cmp) 3327 { 3328 int i; 3329 struct page *pg; 3330 3331 for (i = 0; i < cmp->num_pages; i++) { 3332 pg = cmp->src_pages[i]; 3333 if (pg) { 3334 unlock_page(pg); 3335 put_page(pg); 3336 cmp->src_pages[i] = NULL; 3337 } 3338 pg = cmp->dst_pages[i]; 3339 if (pg) { 3340 unlock_page(pg); 3341 put_page(pg); 3342 cmp->dst_pages[i] = NULL; 3343 } 3344 } 3345 } 3346 3347 static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, 3348 struct inode *dst, u64 dst_loff, 3349 u64 len, struct cmp_pages *cmp) 3350 { 3351 int ret; 3352 int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; 3353 3354 cmp->num_pages = num_pages; 3355 3356 ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); 3357 if (ret) 3358 goto out; 3359 3360 ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); 3361 3362 out: 3363 if (ret) 3364 btrfs_cmp_data_free(cmp); 3365 return ret; 3366 } 3367 3368 static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) 3369 { 3370 int ret = 0; 3371 int i; 3372 struct page *src_page, *dst_page; 3373 unsigned int cmp_len = PAGE_SIZE; 3374 void *addr, *dst_addr; 3375 3376 i = 0; 3377 while (len) { 3378 if (len < PAGE_SIZE) 3379 cmp_len = len; 3380 3381 BUG_ON(i >= cmp->num_pages); 3382 3383 src_page = cmp->src_pages[i]; 3384 dst_page = cmp->dst_pages[i]; 3385 ASSERT(PageLocked(src_page)); 3386 ASSERT(PageLocked(dst_page)); 3387 3388 addr = kmap_atomic(src_page); 3389 dst_addr = kmap_atomic(dst_page); 3390 3391 flush_dcache_page(src_page); 3392 flush_dcache_page(dst_page); 3393 3394 if (memcmp(addr, dst_addr, cmp_len)) 3395 ret = -EBADE; 3396 3397 kunmap_atomic(addr); 3398 kunmap_atomic(dst_addr); 3399 3400 if (ret) 3401 break; 3402 3403 len -= cmp_len; 3404 i++; 3405 } 3406 
return ret;
}

/*
 * Validate one side of a dedupe request.
 *
 * @off + @olen must not wrap and must lie within i_size. If the range ends
 * exactly at i_size, *@plen is extended up to the next block boundary. The
 * resulting range must be block aligned at both ends.
 *
 * Returns 0 when the range is acceptable, -EINVAL otherwise.
 */
static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
				     u64 olen)
{
	u64 len = *plen;
	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;

	if (off + olen > inode->i_size || off + olen < off)
		return -EINVAL;

	/* if we extend to eof, continue to block boundary */
	if (off + len == inode->i_size)
		*plen = len = ALIGN(inode->i_size, bs) - off;

	/* Check that we are block aligned - btrfs_clone() requires this */
	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
		return -EINVAL;

	return 0;
}

/*
 * Deduplicate one chunk: verify that @olen bytes at @loff in @src equal the
 * bytes at @dst_loff in @dst and, if so, clone the source extents into the
 * destination.
 *
 * @cmp supplies preallocated page arrays; its pages are always released
 * before returning. Returns 0 on success, -EINVAL for unacceptable ranges,
 * -EBADE when the data differs, or an error from the helpers.
 */
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
				   struct inode *dst, u64 dst_loff,
				   struct cmp_pages *cmp)
{
	int ret;
	u64 len = olen;
	bool same_inode = (src == dst);
	u64 same_lock_start = 0;
	u64 same_lock_len = 0;

	ret = extent_same_check_offsets(src, loff, &len, olen);
	if (ret)
		return ret;

	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
	if (ret)
		return ret;

	if (same_inode) {
		/*
		 * Single inode case wants the same checks, except we
		 * don't want our length pushed out past i_size as
		 * comparing that data range makes no sense.
		 *
		 * extent_same_check_offsets() will do this for an
		 * unaligned length at i_size, so catch it here and
		 * reject the request.
		 *
		 * This effectively means we require aligned extents
		 * for the single-inode case, whereas the other cases
		 * allow an unaligned length so long as it ends at
		 * i_size.
		 */
		if (len != olen)
			return -EINVAL;

		/* Check for overlapping ranges */
		if (dst_loff + len > loff && dst_loff < loff + len)
			return -EINVAL;

		/* One lock covering both ranges and the gap between them. */
		same_lock_start = min_t(u64, loff, dst_loff);
		same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
	} else {
		/*
		 * If the source and destination inodes are different, the
		 * source's range end offset matches the source's i_size, that
		 * i_size is not a multiple of the sector size, and the
		 * destination range does not go past the destination's i_size,
		 * we must round down the length to the nearest sector size
		 * multiple. If we don't do this adjustment we end up replacing
		 * with zeroes the bytes in the range that starts at the
		 * deduplication range's end offset and ends at the next sector
		 * size multiple.
		 */
		if (loff + olen == i_size_read(src) &&
		    dst_loff + len < i_size_read(dst)) {
			const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;

			len = round_down(i_size_read(src), sz) - loff;
			olen = len;
		}
	}

again:
	/* Pages first, io tree ranges second (see lock_extent_range()). */
	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
	if (ret)
		return ret;

	if (same_inode)
		ret = lock_extent_range(src, same_lock_start, same_lock_len,
					false);
	else
		ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
					       false);
	/*
	 * If one of the inodes has dirty pages in the respective range or
	 * ordered extents, we need to flush delalloc and wait for all ordered
	 * extents in the range. We must unlock the pages and the ranges in the
	 * io trees to avoid deadlocks when flushing delalloc (requires locking
	 * pages) and when waiting for ordered extents to complete (they require
	 * range locking).
	 */
	if (ret == -EAGAIN) {
		/*
		 * Ranges in the io trees already unlocked. Now unlock all
		 * pages before waiting for all IO to complete.
		 */
		btrfs_cmp_data_free(cmp);
		if (same_inode) {
			btrfs_wait_ordered_range(src, same_lock_start,
						 same_lock_len);
		} else {
			btrfs_wait_ordered_range(src, loff, len);
			btrfs_wait_ordered_range(dst, dst_loff, len);
		}
		goto again;
	}
	/* With retries disabled, -EAGAIN is the only failure expected here. */
	ASSERT(ret == 0);
	if (WARN_ON(ret)) {
		/* ranges in the io trees already unlocked */
		btrfs_cmp_data_free(cmp);
		return ret;
	}

	/* pass original length for comparison so we stay within i_size */
	ret = btrfs_cmp_data(olen, cmp);
	if (ret == 0)
		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);

	if (same_inode)
		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
			      same_lock_start + same_lock_len - 1);
	else
		btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);

	btrfs_cmp_data_free(cmp);

	return ret;
}

/* Largest range handled by a single btrfs_extent_same_range() call. */
#define BTRFS_MAX_DEDUPE_LEN	SZ_16M

/*
 * Deduplicate @olen bytes between @src at @loff and @dst at @dst_loff,
 * working in chunks of at most BTRFS_MAX_DEDUPE_LEN bytes.
 *
 * Returns 0 on success (including @olen == 0), -EINVAL when only one of the
 * inodes has BTRFS_INODE_NODATASUM set, -ENOMEM, or the error of the first
 * failing chunk.
 */
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	int ret;
	struct cmp_pages cmp;
	int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
	bool same_inode = (src == dst);
	u64 i, tail_len, chunk_count;

	if (olen == 0)
		return 0;

	if (same_inode)
		inode_lock(src);
	else
		btrfs_double_inode_lock(src, dst);

	/* don't make the dst file partly checksummed */
	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Full-size chunks plus an optional shorter tail. */
	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
	/* Only a tail to process: size the page arrays for it alone. */
	if (chunk_count == 0)
		num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;

	/*
	 * If deduping ranges in the same inode, locking rules make it
	 * mandatory to always lock pages in ascending order to avoid deadlocks
	 * with concurrent tasks (such as starting
writeback/delalloc). 3585 */ 3586 if (same_inode && dst_loff < loff) 3587 swap(loff, dst_loff); 3588 3589 /* 3590 * We must gather up all the pages before we initiate our extent 3591 * locking. We use an array for the page pointers. Size of the array is 3592 * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. 3593 */ 3594 cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), 3595 GFP_KERNEL | __GFP_ZERO); 3596 cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), 3597 GFP_KERNEL | __GFP_ZERO); 3598 if (!cmp.src_pages || !cmp.dst_pages) { 3599 ret = -ENOMEM; 3600 goto out_free; 3601 } 3602 3603 for (i = 0; i < chunk_count; i++) { 3604 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, 3605 dst, dst_loff, &cmp); 3606 if (ret) 3607 goto out_free; 3608 3609 loff += BTRFS_MAX_DEDUPE_LEN; 3610 dst_loff += BTRFS_MAX_DEDUPE_LEN; 3611 } 3612 3613 if (tail_len > 0) 3614 ret = btrfs_extent_same_range(src, loff, tail_len, dst, 3615 dst_loff, &cmp); 3616 3617 out_free: 3618 kvfree(cmp.src_pages); 3619 kvfree(cmp.dst_pages); 3620 3621 out_unlock: 3622 if (same_inode) 3623 inode_unlock(src); 3624 else 3625 btrfs_double_inode_unlock(src, dst); 3626 3627 return ret; 3628 } 3629 3630 int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, 3631 struct file *dst_file, loff_t dst_loff, 3632 u64 olen) 3633 { 3634 struct inode *src = file_inode(src_file); 3635 struct inode *dst = file_inode(dst_file); 3636 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 3637 3638 if (WARN_ON_ONCE(bs < PAGE_SIZE)) { 3639 /* 3640 * Btrfs does not support blocksize < page_size. As a 3641 * result, btrfs_cmp_data() won't correctly handle 3642 * this situation without an update. 
*/
		return -EINVAL;
	}

	return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
}

/*
 * Finish one step of a clone on the destination @inode: bump its iversion,
 * optionally refresh mtime/ctime, grow i_size if the cloned data ends past it,
 * write the inode item and end @trans.
 *
 * @endoff is clamped to @destoff + @olen before being compared with i_size.
 * When @no_time_update is non-zero the timestamps are left untouched.
 *
 * The transaction handle is always ended here, on success and on failure.
 * Returns 0 or a negative errno from btrfs_update_inode() /
 * btrfs_end_transaction().
 */
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update)
		inode->i_mtime = inode->i_ctime = current_time(inode);
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size)
		btrfs_i_size_write(BTRFS_I(inode), endoff);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}
	ret = btrfs_end_transaction(trans);
out:
	return ret;
}

/*
 * Insert an extent map into @inode's extent map tree for a range that was
 * just cloned.
 *
 * With a non-NULL @path the map is built from the file extent item that
 * @path points at. With a NULL @path a hole map covering
 * [@hole_offset, @hole_offset + @hole_len) is built instead.
 *
 * Nothing is returned. If the map cannot be allocated or inserted, or if the
 * item is an inline extent, BTRFS_INODE_NEEDS_FULL_SYNC is set on the inode.
 */
static void clone_update_extent_map(struct btrfs_inode *inode,
				    const struct btrfs_trans_handle *trans,
				    const struct btrfs_path *path,
				    const u64 hole_offset,
				    const u64 hole_len)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	int ret;

	em = alloc_extent_map();
	if (!em) {
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		return;
	}

	if (path) {
		struct btrfs_file_extent_item *fi;

		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
		em->generation = -1;
		if (btrfs_file_extent_type(path->nodes[0], fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&inode->runtime_flags);
	} else {
		/* No item to copy from: describe the range as a hole. */
		em->start = hole_offset;
		em->len = hole_len;
		em->ram_bytes = em->len;
		em->orig_start = hole_offset;
		em->block_start = EXTENT_MAP_HOLE;
		em->block_len = 0;
		em->orig_block_len = 0;
		em->compress_type = BTRFS_COMPRESS_NONE;
		em->generation = trans->transid;
	}

	/*
	 * Retry the insertion for as long as it reports -EEXIST, dropping the
	 * cached maps of the range before each new attempt.
	 */
	while (1) {
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 1);
		write_unlock(&em_tree->lock);
		if (ret != -EEXIST) {
			free_extent_map(em);
			break;
		}
		btrfs_drop_extent_cache(inode, em->start,
					em->start + em->len - 1, 0);
	}

	if (ret)
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
}

/*
 * Make sure we do not end up inserting an inline extent into a file that has
 * already other (non-inline) extents. If a file has an inline extent it can
 * not have any other extents and the (single) inline extent must start at the
 * file offset 0. Failing to respect these rules will lead to file corruption,
 * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
 *
 * We can have extents that have been already written to disk or we can have
 * dirty ranges still in delalloc, in which case the extent maps and items are
 * created only when we run delalloc, and the delalloc ranges might fall outside
 * the range we are currently locking in the inode's io tree. So we check the
 * inode's i_size because of that (i_size updates are done while holding the
 * i_mutex, which we are holding here).
 * We also check to see if the inode has a size not greater than "datal" but has
 * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
 * protected against such concurrent fallocate calls by the i_mutex).
 *
 * If the file has no extents but a size greater than datal, do not allow the
 * copy because we would need to turn the inline extent into a non-inline one
 * (even with NO_HOLES enabled).
If we find our destination inode only has one inline
 * extent, just overwrite it with the source inline extent if its size is less
 * than the source extent's size, or we could copy the source inline extent's
 * data into the destination inode's inline extent if the latter is greater than
 * the former.
 */
static int clone_copy_inline_extent(struct inode *dst,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 skip,
				    const u64 size,
				    char *inline_data)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
	struct btrfs_root *root = BTRFS_I(dst)->root;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	int ret;
	struct btrfs_key key;

	/* An inline extent is only accepted at destination file offset 0. */
	if (new_key->offset > 0)
		return -EOPNOTSUPP;

	/* Look up the destination's file extent item at offset 0. */
	key.objectid = btrfs_ino(BTRFS_I(dst));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		/* No item at offset 0; inspect whatever item comes next. */
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			ASSERT(key.offset > 0);
			return -EOPNOTSUPP;
		}
	} else if (i_size_read(dst) <= datal) {
		struct btrfs_file_extent_item *ei;
		u64 ext_len;

		/*
		 * If the file size is <= datal, make sure there are no other
		 * extents following (can happen due to a fallocate call with
		 * the flag FALLOC_FL_KEEP_SIZE).
		 */
		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent, it can not have other extents
		 * following it.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
		if (ext_len > aligned_end)
			return -EOPNOTSUPP;

		ret = btrfs_next_item(root, path);
		if (ret < 0) {
			return ret;
		} else if (ret == 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				return -EOPNOTSUPP;
		}
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt the same way.
	 */
	if (i_size_read(dst) > datal) {
		/*
		 * If the destination inode has an inline extent...
		 * This would require copying the data from the source inline
		 * extent into the beginning of the destination's inline extent.
		 * But this is really complex, both extents can be compressed
		 * or just one of them, which would require decompressing and
		 * re-compressing data (which could increase the new compressed
		 * size, not allowing the compressed data to fit anymore in an
		 * inline extent).
		 * So just don't support this case for now (it should be rare,
		 * we are not really saving space when cloning inline extents).
*/
		return -EOPNOTSUPP;
	}

	/* Replace whatever the destination has in the range with the copy. */
	btrfs_release_path(path);
	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
	if (ret)
		return ret;
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (ret)
		return ret;

	if (skip) {
		const u32 start = btrfs_file_extent_calc_inline_size(0);

		/* Shift the payload down so the skipped leading bytes go away. */
		memmove(inline_data + start, inline_data + start + skip, datal);
	}

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	inode_add_bytes(dst, datal);

	return 0;
}

/**
 * btrfs_clone() - clone a range from inode file to another
 *
 * @src: Inode to clone from
 * @inode: Inode to clone to
 * @off: Offset within source to start clone from
 * @olen: Original length, passed by user, of range to clone
 * @olen_aligned: Block-aligned value of olen
 * @destoff: Offset within @inode to start clone
 * @no_time_update: When non-zero, mtime/ctime of the target inode are not
 *                  updated (passed through to clone_finish_inode_update())
 *
 * Walks the file extent items of @src that overlap the range and, for each
 * one, starts its own transaction, drops the overlapping destination extents,
 * inserts a copy of the item and updates the destination inode.
 *
 * Return: 0 on success, -EINTR if a fatal signal is pending between items, or
 * another negative errno from the tree/transaction helpers.
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, int no_time_update)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char *buf = NULL;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	/* End of the destination range handled so far. */
	u64 last_dest_end = destoff;

	ret = -ENOMEM;
	/* Scratch buffer that receives a copy of each source item. */
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		kvfree(buf);
		return ret;
	}

	path->reada = READA_FORWARD;
	/* clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	while (1) {
		u64 next_key_min_offset = key.offset + 1;

		/*
		 * note the key will change type as we walk through the
		 * tree.
		 */
		path->leave_spinning = 1;
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				0, 0);
		if (ret < 0)
			goto out;
		/*
		 * First search, if no extent item that starts at offset off was
		 * found but the previous item is an extent item, it's possible
		 * it might overlap our target range, therefore process it.
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}

		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		/* Stop once we walk past the source inode's extent items. */
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		if (key.type == BTRFS_EXTENT_DATA_KEY) {
			struct btrfs_file_extent_item *extent;
			int type;
			u32 size;
			struct btrfs_key new_key;
			u64 disko = 0, diskl = 0;
			u64 datao = 0, datal = 0;
			u8 comp;
			u64 drop_start;

			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
			comp = btrfs_file_extent_compression(leaf, extent);
			type = btrfs_file_extent_type(leaf, extent);
			if (type == BTRFS_FILE_EXTENT_REG ||
			    type == BTRFS_FILE_EXTENT_PREALLOC) {
				disko = btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				diskl = btrfs_file_extent_disk_num_bytes(leaf,
								 extent);
				datao = btrfs_file_extent_offset(leaf, extent);
				datal = btrfs_file_extent_num_bytes(leaf,
								    extent);
			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				/* take upper bound, may be compressed */
				datal = btrfs_file_extent_ram_bytes(leaf,
								    extent);
			}

			/*
			 * The first search might have left us at an extent
			 * item that ends before our target range's start, can
			 * happen if we have holes and NO_HOLES feature enabled.
			 */
			if (key.offset + datal <= off) {
				path->slots[0]++;
				goto process_slot;
			} else if (key.offset >= off + len) {
				break;
			}
			next_key_min_offset = key.offset + datal;
			size = btrfs_item_size_nr(leaf, slot);
			/* Copy the item out before the path is released. */
			read_extent_buffer(leaf, buf,
					   btrfs_item_ptr_offset(leaf, slot),
					   size);

			btrfs_release_path(path);
			path->leave_spinning = 0;

			/* Translate the source key into the destination file. */
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.objectid = btrfs_ino(BTRFS_I(inode));
			if (off <= key.offset)
				new_key.offset = key.offset + destoff - off;
			else
				new_key.offset = destoff;

			/*
			 * Deal with a hole that doesn't have an extent item
			 * that represents it (NO_HOLES feature enabled).
			 * This hole is either in the middle of the cloning
			 * range or at the beginning (fully overlaps it or
			 * partially overlaps it).
			 */
			if (new_key.offset != last_dest_end)
				drop_start = last_dest_end;
			else
				drop_start = new_key.offset;

			/*
			 * 1 - adjusting old extent (we may have to split it)
			 * 1 - add new extent
			 * 1 - inode update
			 */
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}

			if (type == BTRFS_FILE_EXTENT_REG ||
			    type == BTRFS_FILE_EXTENT_PREALLOC) {
				/*
				 *    a  | --- range to clone ---|  b
				 * | ------------- extent ------------- |
				 */

				/* subtract range b */
				if (key.offset + datal > off + len)
					datal = off + len - key.offset;

				/* subtract range a */
				if (off > key.offset) {
					datao += off - key.offset;
					datal -= off - key.offset;
				}

				ret = btrfs_drop_extents(trans, root, inode,
							 drop_start,
							 new_key.offset + datal,
							 1);
				if (ret) {
					if (ret != -EOPNOTSUPP)
						btrfs_abort_transaction(trans,
									ret);
					btrfs_end_transaction(trans);
					goto out;
				}

				ret = btrfs_insert_empty_item(trans, root, path,
							      &new_key, size);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					btrfs_end_transaction(trans);
					goto out;
				}

				leaf = path->nodes[0];
				slot = path->slots[0];
				write_extent_buffer(leaf, buf,
					    btrfs_item_ptr_offset(leaf, slot),
					    size);

				extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);

				/* disko == 0 means it's a hole */
				if (!disko)
					datao = 0;

				btrfs_set_file_extent_offset(leaf, extent,
							     datao);
				btrfs_set_file_extent_num_bytes(leaf, extent,
								datal);

				/* A real extent gains a reference and bytes. */
				if (disko) {
					inode_add_bytes(inode, datal);
					ret = btrfs_inc_extent_ref(trans,
							root,
							disko, diskl, 0,
							root->root_key.objectid,
							btrfs_ino(BTRFS_I(inode)),
							new_key.offset - datao);
					if (ret) {
						btrfs_abort_transaction(trans,
									ret);
						btrfs_end_transaction(trans);
						goto out;

					}
				}
			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				/* Bytes to cut from the front / the back. */
				u64 skip = 0;
				u64 trim = 0;

				if (off > key.offset) {
					skip = off - key.offset;
					new_key.offset += skip;
				}

				if (key.offset + datal > off + len)
					trim = key.offset + datal - (off + len);

				/* Compressed inline extents are never split. */
				if (comp && (skip || trim)) {
					ret = -EINVAL;
					btrfs_end_transaction(trans);
					goto out;
				}
				size -= skip + trim;
				datal -= skip + trim;

				ret = clone_copy_inline_extent(inode,
							       trans, path,
							       &new_key,
							       drop_start,
							       datal,
							       skip, size, buf);
				if (ret) {
					if (ret != -EOPNOTSUPP)
						btrfs_abort_transaction(trans,
									ret);
					btrfs_end_transaction(trans);
					goto out;
				}
				leaf = path->nodes[0];
				slot = path->slots[0];
			}

			/* If we have an implicit hole (NO_HOLES feature). */
			if (drop_start < new_key.offset)
				clone_update_extent_map(BTRFS_I(inode), trans,
						NULL, drop_start,
						new_key.offset - drop_start);

			clone_update_extent_map(BTRFS_I(inode), trans,
						path, 0, 0);

			btrfs_mark_buffer_dirty(leaf);
			btrfs_release_path(path);

			last_dest_end = ALIGN(new_key.offset + datal,
					      fs_info->sectorsize);
			/* This also ends the transaction started above. */
			ret = clone_finish_inode_update(trans, inode,
							last_dest_end,
							destoff, olen,
							no_time_update);
			if (ret)
				goto out;
			if (new_key.offset + datal >= destoff + len)
				break;
		}
		btrfs_release_path(path);
		key.offset = next_key_min_offset;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
	}
	ret = 0;

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole (NO_HOLES feature is enabled) that
		 * fully or partially overlaps our cloning range at its end.
		 */
		btrfs_release_path(path);

		/*
		 * 1 - remove extent(s)
		 * 1 - inode update
		 */
		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
		ret = btrfs_drop_extents(trans, root, inode,
					 last_dest_end, destoff + len, 1);
		if (ret) {
			if (ret != -EOPNOTSUPP)
				btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
		clone_update_extent_map(BTRFS_I(inode), trans, NULL,
					last_dest_end,
					destoff + len - last_dest_end);
		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	btrfs_free_path(path);
	kvfree(buf);
	return ret;
}

/*
 * Clone @olen bytes at offset @off of @file_src into @file at @destoff.
 * An @olen of 0 means "from @off up to the end of the source file".
 *
 * Returns 0 on success, -EROFS for a read-only root, -EXDEV when the files
 * are on different mounts or superblocks, -EISDIR when either is a directory,
 * -EINVAL for bad, misaligned or overlapping ranges or mismatching
 * NODATASUM flags, or another negative errno from the helpers it calls.
 */
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
					u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	u64 len = olen;
	u64 bs = fs_info->sb->s_blocksize;
	int same_inode = src == inode;

	/*
	 * TODO:
	 * - split compressed inline extents.  annoying: we need to
	 *   decompress into destination's address_space (the file offset
	 *   may change, so source mapping won't do), then recompress (or
	 *   otherwise reinsert) a subrange.
	 *
	 * - split destination inode's inline extents. The inline extents can
	 *   be either compressed or non-compressed.
*/

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (file_src->f_path.mnt != file->f_path.mnt ||
	    src->i_sb != inode->i_sb)
		return -EXDEV;

	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
		return -EISDIR;

	if (!same_inode) {
		btrfs_double_inode_lock(src, inode);
	} else {
		inode_lock(src);
	}

	/* don't make the dst file partly checksummed */
	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* determine range to clone */
	ret = -EINVAL;
	/* Reject ranges past the source's i_size and u64 wrap-around. */
	if (off + len > src->i_size || off + len < off)
		goto out_unlock;
	if (len == 0)
		olen = len = src->i_size - off;
	/* if we extend to eof, continue to block boundary */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	/* Nothing left to clone: report success. */
	if (len == 0) {
		ret = 0;
		goto out_unlock;
	}

	/* verify the end result is block aligned */
	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
	    !IS_ALIGNED(destoff, bs))
		goto out_unlock;

	/* verify if ranges are overlapped within the same file */
	if (same_inode) {
		if (destoff + len > off && destoff < off + len)
			goto out_unlock;
	}

	/* Destination starts past its EOF: expand the file up to destoff. */
	if (destoff > inode->i_size) {
		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Lock the target range too. Right after we replace the file extent
	 * items in the fs tree (which now point to the cloned data), we might
	 * have a worker replace them with extent items relative to a write
	 * operation that was issued before this clone operation (i.e. confront
	 * with inode.c:btrfs_finish_ordered_io).
	 */
	if (same_inode) {
		/* One lock spanning from the lower start to the higher end. */
		u64 lock_start = min_t(u64, off, destoff);
		u64 lock_len = max_t(u64, off, destoff) + len - lock_start;

		ret = lock_extent_range(src, lock_start, lock_len, true);
	} else {
		ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
					       true);
	}
	ASSERT(ret == 0);
	if (WARN_ON(ret)) {
		/* ranges in the io trees already unlocked */
		goto out_unlock;
	}

	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);

	if (same_inode) {
		u64 lock_start = min_t(u64, off, destoff);
		u64 lock_end = max_t(u64, off, destoff) + len - 1;

		unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
	} else {
		btrfs_double_extent_unlock(src, off, inode, destoff, len);
	}
	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	truncate_inode_pages_range(&inode->i_data,
				round_down(destoff, PAGE_SIZE),
				round_up(destoff + len, PAGE_SIZE) - 1);
out_unlock:
	if (!same_inode)
		btrfs_double_inode_unlock(src, inode);
	else
		inode_unlock(src);
	return ret;
}

/*
 * Clone entry point with (source, destination) argument order; it simply
 * forwards to btrfs_clone_files(), which takes the destination file first.
 */
int btrfs_clone_file_range(struct file *src_file, loff_t off,
		struct file *dst_file, loff_t destoff, u64 len)
{
	return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}

/*
 * Ioctl handler that points the "default" dir item in the tree root at the
 * subvolume whose objectid is read from @argp. An objectid of 0 selects
 * BTRFS_FS_TREE_OBJECTID. Requires CAP_SYS_ADMIN.
 */
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_root *new_root;
	struct btrfs_dir_item *di;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	u64 objectid = 0;
	u64 dir_id;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
		ret = -EFAULT;
		goto out;
	}

	/* 0 is shorthand for the top-level fs tree. */
	if (!objectid)
		objectid = BTRFS_FS_TREE_OBJECTID;

	location.objectid = objectid;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = (u64)-1;

	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		goto out;
	}
	if (!is_fstree(new_root->objectid)) {
		ret = -ENOENT;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->leave_spinning = 1;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		ret = PTR_ERR(trans);
		goto out;
	}

	/* Find the "default" dir item in the tree root's root directory. */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
				   dir_id, "default", 7, 1);
	if (IS_ERR_OR_NULL(di)) {
		btrfs_free_path(path);
		btrfs_end_transaction(trans);
		btrfs_err(fs_info,
			  "Umm, you don't have the default diritem, this isn't going to work");
		ret = -ENOENT;
		goto out;
	}

	/* Re-point the dir item at the chosen subvolume's root key. */
	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
	btrfs_end_transaction(trans);
out:
	mnt_drop_write_file(file);
	return ret;
}

/*
 * Summarize the block groups linked on @groups_list into @space: total_bytes
 * and used_bytes are summed over the list, flags is taken from the entries
 * (the last one visited wins). An empty list leaves all three fields at 0.
 */
static void get_block_group_info(struct list_head *groups_list,
				 struct btrfs_ioctl_space_info *space)
{
	struct btrfs_block_group_cache *block_group;

	space->total_bytes = 0;
	space->used_bytes = 0;
	space->flags = 0;
	list_for_each_entry(block_group, groups_list, list) {
		space->flags = block_group->flags;
space->total_bytes += block_group->key.offset; 4450 space->used_bytes += 4451 btrfs_block_group_used(&block_group->item); 4452 } 4453 } 4454 4455 static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, 4456 void __user *arg) 4457 { 4458 struct btrfs_ioctl_space_args space_args; 4459 struct btrfs_ioctl_space_info space; 4460 struct btrfs_ioctl_space_info *dest; 4461 struct btrfs_ioctl_space_info *dest_orig; 4462 struct btrfs_ioctl_space_info __user *user_dest; 4463 struct btrfs_space_info *info; 4464 static const u64 types[] = { 4465 BTRFS_BLOCK_GROUP_DATA, 4466 BTRFS_BLOCK_GROUP_SYSTEM, 4467 BTRFS_BLOCK_GROUP_METADATA, 4468 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA 4469 }; 4470 int num_types = 4; 4471 int alloc_size; 4472 int ret = 0; 4473 u64 slot_count = 0; 4474 int i, c; 4475 4476 if (copy_from_user(&space_args, 4477 (struct btrfs_ioctl_space_args __user *)arg, 4478 sizeof(space_args))) 4479 return -EFAULT; 4480 4481 for (i = 0; i < num_types; i++) { 4482 struct btrfs_space_info *tmp; 4483 4484 info = NULL; 4485 rcu_read_lock(); 4486 list_for_each_entry_rcu(tmp, &fs_info->space_info, 4487 list) { 4488 if (tmp->flags == types[i]) { 4489 info = tmp; 4490 break; 4491 } 4492 } 4493 rcu_read_unlock(); 4494 4495 if (!info) 4496 continue; 4497 4498 down_read(&info->groups_sem); 4499 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 4500 if (!list_empty(&info->block_groups[c])) 4501 slot_count++; 4502 } 4503 up_read(&info->groups_sem); 4504 } 4505 4506 /* 4507 * Global block reserve, exported as a space_info 4508 */ 4509 slot_count++; 4510 4511 /* space_slots == 0 means they are asking for a count */ 4512 if (space_args.space_slots == 0) { 4513 space_args.total_spaces = slot_count; 4514 goto out; 4515 } 4516 4517 slot_count = min_t(u64, space_args.space_slots, slot_count); 4518 4519 alloc_size = sizeof(*dest) * slot_count; 4520 4521 /* we generally have at most 6 or so space infos, one for each raid 4522 * level. 
So, a whole page should be more than enough for everyone 4523 */ 4524 if (alloc_size > PAGE_SIZE) 4525 return -ENOMEM; 4526 4527 space_args.total_spaces = 0; 4528 dest = kmalloc(alloc_size, GFP_KERNEL); 4529 if (!dest) 4530 return -ENOMEM; 4531 dest_orig = dest; 4532 4533 /* now we have a buffer to copy into */ 4534 for (i = 0; i < num_types; i++) { 4535 struct btrfs_space_info *tmp; 4536 4537 if (!slot_count) 4538 break; 4539 4540 info = NULL; 4541 rcu_read_lock(); 4542 list_for_each_entry_rcu(tmp, &fs_info->space_info, 4543 list) { 4544 if (tmp->flags == types[i]) { 4545 info = tmp; 4546 break; 4547 } 4548 } 4549 rcu_read_unlock(); 4550 4551 if (!info) 4552 continue; 4553 down_read(&info->groups_sem); 4554 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 4555 if (!list_empty(&info->block_groups[c])) { 4556 get_block_group_info(&info->block_groups[c], 4557 &space); 4558 memcpy(dest, &space, sizeof(space)); 4559 dest++; 4560 space_args.total_spaces++; 4561 slot_count--; 4562 } 4563 if (!slot_count) 4564 break; 4565 } 4566 up_read(&info->groups_sem); 4567 } 4568 4569 /* 4570 * Add global block reserve 4571 */ 4572 if (slot_count) { 4573 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4574 4575 spin_lock(&block_rsv->lock); 4576 space.total_bytes = block_rsv->size; 4577 space.used_bytes = block_rsv->size - block_rsv->reserved; 4578 spin_unlock(&block_rsv->lock); 4579 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; 4580 memcpy(dest, &space, sizeof(space)); 4581 space_args.total_spaces++; 4582 } 4583 4584 user_dest = (struct btrfs_ioctl_space_info __user *) 4585 (arg + sizeof(struct btrfs_ioctl_space_args)); 4586 4587 if (copy_to_user(user_dest, dest_orig, alloc_size)) 4588 ret = -EFAULT; 4589 4590 kfree(dest_orig); 4591 out: 4592 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) 4593 ret = -EFAULT; 4594 4595 return ret; 4596 } 4597 4598 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, 4599 void __user *argp) 4600 { 4601 
struct btrfs_trans_handle *trans; 4602 u64 transid; 4603 int ret; 4604 4605 trans = btrfs_attach_transaction_barrier(root); 4606 if (IS_ERR(trans)) { 4607 if (PTR_ERR(trans) != -ENOENT) 4608 return PTR_ERR(trans); 4609 4610 /* No running transaction, don't bother */ 4611 transid = root->fs_info->last_trans_committed; 4612 goto out; 4613 } 4614 transid = trans->transid; 4615 ret = btrfs_commit_transaction_async(trans, 0); 4616 if (ret) { 4617 btrfs_end_transaction(trans); 4618 return ret; 4619 } 4620 out: 4621 if (argp) 4622 if (copy_to_user(argp, &transid, sizeof(transid))) 4623 return -EFAULT; 4624 return 0; 4625 } 4626 4627 static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, 4628 void __user *argp) 4629 { 4630 u64 transid; 4631 4632 if (argp) { 4633 if (copy_from_user(&transid, argp, sizeof(transid))) 4634 return -EFAULT; 4635 } else { 4636 transid = 0; /* current trans */ 4637 } 4638 return btrfs_wait_for_commit(fs_info, transid); 4639 } 4640 4641 static long btrfs_ioctl_scrub(struct file *file, void __user *arg) 4642 { 4643 struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); 4644 struct btrfs_ioctl_scrub_args *sa; 4645 int ret; 4646 4647 if (!capable(CAP_SYS_ADMIN)) 4648 return -EPERM; 4649 4650 sa = memdup_user(arg, sizeof(*sa)); 4651 if (IS_ERR(sa)) 4652 return PTR_ERR(sa); 4653 4654 if (!(sa->flags & BTRFS_SCRUB_READONLY)) { 4655 ret = mnt_want_write_file(file); 4656 if (ret) 4657 goto out; 4658 } 4659 4660 ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, 4661 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 4662 0); 4663 4664 if (copy_to_user(arg, sa, sizeof(*sa))) 4665 ret = -EFAULT; 4666 4667 if (!(sa->flags & BTRFS_SCRUB_READONLY)) 4668 mnt_drop_write_file(file); 4669 out: 4670 kfree(sa); 4671 return ret; 4672 } 4673 4674 static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) 4675 { 4676 if (!capable(CAP_SYS_ADMIN)) 4677 return -EPERM; 4678 4679 return btrfs_scrub_cancel(fs_info); 4680 } 4681 
4682 static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, 4683 void __user *arg) 4684 { 4685 struct btrfs_ioctl_scrub_args *sa; 4686 int ret; 4687 4688 if (!capable(CAP_SYS_ADMIN)) 4689 return -EPERM; 4690 4691 sa = memdup_user(arg, sizeof(*sa)); 4692 if (IS_ERR(sa)) 4693 return PTR_ERR(sa); 4694 4695 ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); 4696 4697 if (copy_to_user(arg, sa, sizeof(*sa))) 4698 ret = -EFAULT; 4699 4700 kfree(sa); 4701 return ret; 4702 } 4703 4704 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, 4705 void __user *arg) 4706 { 4707 struct btrfs_ioctl_get_dev_stats *sa; 4708 int ret; 4709 4710 sa = memdup_user(arg, sizeof(*sa)); 4711 if (IS_ERR(sa)) 4712 return PTR_ERR(sa); 4713 4714 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { 4715 kfree(sa); 4716 return -EPERM; 4717 } 4718 4719 ret = btrfs_get_dev_stats(fs_info, sa); 4720 4721 if (copy_to_user(arg, sa, sizeof(*sa))) 4722 ret = -EFAULT; 4723 4724 kfree(sa); 4725 return ret; 4726 } 4727 4728 static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, 4729 void __user *arg) 4730 { 4731 struct btrfs_ioctl_dev_replace_args *p; 4732 int ret; 4733 4734 if (!capable(CAP_SYS_ADMIN)) 4735 return -EPERM; 4736 4737 p = memdup_user(arg, sizeof(*p)); 4738 if (IS_ERR(p)) 4739 return PTR_ERR(p); 4740 4741 switch (p->cmd) { 4742 case BTRFS_IOCTL_DEV_REPLACE_CMD_START: 4743 if (sb_rdonly(fs_info->sb)) { 4744 ret = -EROFS; 4745 goto out; 4746 } 4747 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 4748 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4749 } else { 4750 ret = btrfs_dev_replace_by_ioctl(fs_info, p); 4751 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4752 } 4753 break; 4754 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: 4755 btrfs_dev_replace_status(fs_info, p); 4756 ret = 0; 4757 break; 4758 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: 4759 p->result = btrfs_dev_replace_cancel(fs_info); 4760 ret = 0; 4761 break; 4762 
default: 4763 ret = -EINVAL; 4764 break; 4765 } 4766 4767 if (copy_to_user(arg, p, sizeof(*p))) 4768 ret = -EFAULT; 4769 out: 4770 kfree(p); 4771 return ret; 4772 } 4773 4774 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 4775 { 4776 int ret = 0; 4777 int i; 4778 u64 rel_ptr; 4779 int size; 4780 struct btrfs_ioctl_ino_path_args *ipa = NULL; 4781 struct inode_fs_paths *ipath = NULL; 4782 struct btrfs_path *path; 4783 4784 if (!capable(CAP_DAC_READ_SEARCH)) 4785 return -EPERM; 4786 4787 path = btrfs_alloc_path(); 4788 if (!path) { 4789 ret = -ENOMEM; 4790 goto out; 4791 } 4792 4793 ipa = memdup_user(arg, sizeof(*ipa)); 4794 if (IS_ERR(ipa)) { 4795 ret = PTR_ERR(ipa); 4796 ipa = NULL; 4797 goto out; 4798 } 4799 4800 size = min_t(u32, ipa->size, 4096); 4801 ipath = init_ipath(size, root, path); 4802 if (IS_ERR(ipath)) { 4803 ret = PTR_ERR(ipath); 4804 ipath = NULL; 4805 goto out; 4806 } 4807 4808 ret = paths_from_inode(ipa->inum, ipath); 4809 if (ret < 0) 4810 goto out; 4811 4812 for (i = 0; i < ipath->fspath->elem_cnt; ++i) { 4813 rel_ptr = ipath->fspath->val[i] - 4814 (u64)(unsigned long)ipath->fspath->val; 4815 ipath->fspath->val[i] = rel_ptr; 4816 } 4817 4818 ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, 4819 ipath->fspath, size); 4820 if (ret) { 4821 ret = -EFAULT; 4822 goto out; 4823 } 4824 4825 out: 4826 btrfs_free_path(path); 4827 free_ipath(ipath); 4828 kfree(ipa); 4829 4830 return ret; 4831 } 4832 4833 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) 4834 { 4835 struct btrfs_data_container *inodes = ctx; 4836 const size_t c = 3 * sizeof(u64); 4837 4838 if (inodes->bytes_left >= c) { 4839 inodes->bytes_left -= c; 4840 inodes->val[inodes->elem_cnt] = inum; 4841 inodes->val[inodes->elem_cnt + 1] = offset; 4842 inodes->val[inodes->elem_cnt + 2] = root; 4843 inodes->elem_cnt += 3; 4844 } else { 4845 inodes->bytes_missing += c - inodes->bytes_left; 4846 inodes->bytes_left = 0; 4847 
inodes->elem_missed += 3; 4848 } 4849 4850 return 0; 4851 } 4852 4853 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, 4854 void __user *arg, int version) 4855 { 4856 int ret = 0; 4857 int size; 4858 struct btrfs_ioctl_logical_ino_args *loi; 4859 struct btrfs_data_container *inodes = NULL; 4860 struct btrfs_path *path = NULL; 4861 bool ignore_offset; 4862 4863 if (!capable(CAP_SYS_ADMIN)) 4864 return -EPERM; 4865 4866 loi = memdup_user(arg, sizeof(*loi)); 4867 if (IS_ERR(loi)) 4868 return PTR_ERR(loi); 4869 4870 if (version == 1) { 4871 ignore_offset = false; 4872 size = min_t(u32, loi->size, SZ_64K); 4873 } else { 4874 /* All reserved bits must be 0 for now */ 4875 if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { 4876 ret = -EINVAL; 4877 goto out_loi; 4878 } 4879 /* Only accept flags we have defined so far */ 4880 if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { 4881 ret = -EINVAL; 4882 goto out_loi; 4883 } 4884 ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; 4885 size = min_t(u32, loi->size, SZ_16M); 4886 } 4887 4888 path = btrfs_alloc_path(); 4889 if (!path) { 4890 ret = -ENOMEM; 4891 goto out; 4892 } 4893 4894 inodes = init_data_container(size); 4895 if (IS_ERR(inodes)) { 4896 ret = PTR_ERR(inodes); 4897 inodes = NULL; 4898 goto out; 4899 } 4900 4901 ret = iterate_inodes_from_logical(loi->logical, fs_info, path, 4902 build_ino_list, inodes, ignore_offset); 4903 if (ret == -EINVAL) 4904 ret = -ENOENT; 4905 if (ret < 0) 4906 goto out; 4907 4908 ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, 4909 size); 4910 if (ret) 4911 ret = -EFAULT; 4912 4913 out: 4914 btrfs_free_path(path); 4915 kvfree(inodes); 4916 out_loi: 4917 kfree(loi); 4918 4919 return ret; 4920 } 4921 4922 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, 4923 struct btrfs_ioctl_balance_args *bargs) 4924 { 4925 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4926 4927 bargs->flags = 
bctl->flags; 4928 4929 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) 4930 bargs->state |= BTRFS_BALANCE_STATE_RUNNING; 4931 if (atomic_read(&fs_info->balance_pause_req)) 4932 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; 4933 if (atomic_read(&fs_info->balance_cancel_req)) 4934 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; 4935 4936 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); 4937 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); 4938 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); 4939 4940 spin_lock(&fs_info->balance_lock); 4941 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); 4942 spin_unlock(&fs_info->balance_lock); 4943 } 4944 4945 static long btrfs_ioctl_balance(struct file *file, void __user *arg) 4946 { 4947 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 4948 struct btrfs_fs_info *fs_info = root->fs_info; 4949 struct btrfs_ioctl_balance_args *bargs; 4950 struct btrfs_balance_control *bctl; 4951 bool need_unlock; /* for mut. excl. ops lock */ 4952 int ret; 4953 4954 if (!capable(CAP_SYS_ADMIN)) 4955 return -EPERM; 4956 4957 ret = mnt_want_write_file(file); 4958 if (ret) 4959 return ret; 4960 4961 again: 4962 if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { 4963 mutex_lock(&fs_info->balance_mutex); 4964 need_unlock = true; 4965 goto locked; 4966 } 4967 4968 /* 4969 * mut. excl. ops lock is locked. Three possibilities: 4970 * (1) some other op is running 4971 * (2) balance is running 4972 * (3) balance is paused -- special case (think resume) 4973 */ 4974 mutex_lock(&fs_info->balance_mutex); 4975 if (fs_info->balance_ctl) { 4976 /* this is either (2) or (3) */ 4977 if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4978 mutex_unlock(&fs_info->balance_mutex); 4979 /* 4980 * Lock released to allow other waiters to continue, 4981 * we'll reexamine the status again. 
4982 */ 4983 mutex_lock(&fs_info->balance_mutex); 4984 4985 if (fs_info->balance_ctl && 4986 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4987 /* this is (3) */ 4988 need_unlock = false; 4989 goto locked; 4990 } 4991 4992 mutex_unlock(&fs_info->balance_mutex); 4993 goto again; 4994 } else { 4995 /* this is (2) */ 4996 mutex_unlock(&fs_info->balance_mutex); 4997 ret = -EINPROGRESS; 4998 goto out; 4999 } 5000 } else { 5001 /* this is (1) */ 5002 mutex_unlock(&fs_info->balance_mutex); 5003 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 5004 goto out; 5005 } 5006 5007 locked: 5008 BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); 5009 5010 if (arg) { 5011 bargs = memdup_user(arg, sizeof(*bargs)); 5012 if (IS_ERR(bargs)) { 5013 ret = PTR_ERR(bargs); 5014 goto out_unlock; 5015 } 5016 5017 if (bargs->flags & BTRFS_BALANCE_RESUME) { 5018 if (!fs_info->balance_ctl) { 5019 ret = -ENOTCONN; 5020 goto out_bargs; 5021 } 5022 5023 bctl = fs_info->balance_ctl; 5024 spin_lock(&fs_info->balance_lock); 5025 bctl->flags |= BTRFS_BALANCE_RESUME; 5026 spin_unlock(&fs_info->balance_lock); 5027 5028 goto do_balance; 5029 } 5030 } else { 5031 bargs = NULL; 5032 } 5033 5034 if (fs_info->balance_ctl) { 5035 ret = -EINPROGRESS; 5036 goto out_bargs; 5037 } 5038 5039 bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); 5040 if (!bctl) { 5041 ret = -ENOMEM; 5042 goto out_bargs; 5043 } 5044 5045 if (arg) { 5046 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); 5047 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); 5048 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); 5049 5050 bctl->flags = bargs->flags; 5051 } else { 5052 /* balance everything - no filters */ 5053 bctl->flags |= BTRFS_BALANCE_TYPE_MASK; 5054 } 5055 5056 if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { 5057 ret = -EINVAL; 5058 goto out_bctl; 5059 } 5060 5061 do_balance: 5062 /* 5063 * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to 5064 * btrfs_balance. 
bctl is freed in reset_balance_state, or, if 5065 * restriper was paused all the way until unmount, in free_fs_info. 5066 * The flag should be cleared after reset_balance_state. 5067 */ 5068 need_unlock = false; 5069 5070 ret = btrfs_balance(fs_info, bctl, bargs); 5071 bctl = NULL; 5072 5073 if (arg) { 5074 if (copy_to_user(arg, bargs, sizeof(*bargs))) 5075 ret = -EFAULT; 5076 } 5077 5078 out_bctl: 5079 kfree(bctl); 5080 out_bargs: 5081 kfree(bargs); 5082 out_unlock: 5083 mutex_unlock(&fs_info->balance_mutex); 5084 if (need_unlock) 5085 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 5086 out: 5087 mnt_drop_write_file(file); 5088 return ret; 5089 } 5090 5091 static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) 5092 { 5093 if (!capable(CAP_SYS_ADMIN)) 5094 return -EPERM; 5095 5096 switch (cmd) { 5097 case BTRFS_BALANCE_CTL_PAUSE: 5098 return btrfs_pause_balance(fs_info); 5099 case BTRFS_BALANCE_CTL_CANCEL: 5100 return btrfs_cancel_balance(fs_info); 5101 } 5102 5103 return -EINVAL; 5104 } 5105 5106 static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, 5107 void __user *arg) 5108 { 5109 struct btrfs_ioctl_balance_args *bargs; 5110 int ret = 0; 5111 5112 if (!capable(CAP_SYS_ADMIN)) 5113 return -EPERM; 5114 5115 mutex_lock(&fs_info->balance_mutex); 5116 if (!fs_info->balance_ctl) { 5117 ret = -ENOTCONN; 5118 goto out; 5119 } 5120 5121 bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); 5122 if (!bargs) { 5123 ret = -ENOMEM; 5124 goto out; 5125 } 5126 5127 btrfs_update_ioctl_balance_args(fs_info, bargs); 5128 5129 if (copy_to_user(arg, bargs, sizeof(*bargs))) 5130 ret = -EFAULT; 5131 5132 kfree(bargs); 5133 out: 5134 mutex_unlock(&fs_info->balance_mutex); 5135 return ret; 5136 } 5137 5138 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) 5139 { 5140 struct inode *inode = file_inode(file); 5141 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5142 struct btrfs_ioctl_quota_ctl_args *sa; 5143 int ret; 5144 5145 if 
(!capable(CAP_SYS_ADMIN)) 5146 return -EPERM; 5147 5148 ret = mnt_want_write_file(file); 5149 if (ret) 5150 return ret; 5151 5152 sa = memdup_user(arg, sizeof(*sa)); 5153 if (IS_ERR(sa)) { 5154 ret = PTR_ERR(sa); 5155 goto drop_write; 5156 } 5157 5158 down_write(&fs_info->subvol_sem); 5159 5160 switch (sa->cmd) { 5161 case BTRFS_QUOTA_CTL_ENABLE: 5162 ret = btrfs_quota_enable(fs_info); 5163 break; 5164 case BTRFS_QUOTA_CTL_DISABLE: 5165 ret = btrfs_quota_disable(fs_info); 5166 break; 5167 default: 5168 ret = -EINVAL; 5169 break; 5170 } 5171 5172 kfree(sa); 5173 up_write(&fs_info->subvol_sem); 5174 drop_write: 5175 mnt_drop_write_file(file); 5176 return ret; 5177 } 5178 5179 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) 5180 { 5181 struct inode *inode = file_inode(file); 5182 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5183 struct btrfs_root *root = BTRFS_I(inode)->root; 5184 struct btrfs_ioctl_qgroup_assign_args *sa; 5185 struct btrfs_trans_handle *trans; 5186 int ret; 5187 int err; 5188 5189 if (!capable(CAP_SYS_ADMIN)) 5190 return -EPERM; 5191 5192 ret = mnt_want_write_file(file); 5193 if (ret) 5194 return ret; 5195 5196 sa = memdup_user(arg, sizeof(*sa)); 5197 if (IS_ERR(sa)) { 5198 ret = PTR_ERR(sa); 5199 goto drop_write; 5200 } 5201 5202 trans = btrfs_join_transaction(root); 5203 if (IS_ERR(trans)) { 5204 ret = PTR_ERR(trans); 5205 goto out; 5206 } 5207 5208 if (sa->assign) { 5209 ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); 5210 } else { 5211 ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); 5212 } 5213 5214 /* update qgroup status and info */ 5215 err = btrfs_run_qgroups(trans); 5216 if (err < 0) 5217 btrfs_handle_fs_error(fs_info, err, 5218 "failed to update qgroup status and info"); 5219 err = btrfs_end_transaction(trans); 5220 if (err && !ret) 5221 ret = err; 5222 5223 out: 5224 kfree(sa); 5225 drop_write: 5226 mnt_drop_write_file(file); 5227 return ret; 5228 } 5229 5230 static long 
btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) 5231 { 5232 struct inode *inode = file_inode(file); 5233 struct btrfs_root *root = BTRFS_I(inode)->root; 5234 struct btrfs_ioctl_qgroup_create_args *sa; 5235 struct btrfs_trans_handle *trans; 5236 int ret; 5237 int err; 5238 5239 if (!capable(CAP_SYS_ADMIN)) 5240 return -EPERM; 5241 5242 ret = mnt_want_write_file(file); 5243 if (ret) 5244 return ret; 5245 5246 sa = memdup_user(arg, sizeof(*sa)); 5247 if (IS_ERR(sa)) { 5248 ret = PTR_ERR(sa); 5249 goto drop_write; 5250 } 5251 5252 if (!sa->qgroupid) { 5253 ret = -EINVAL; 5254 goto out; 5255 } 5256 5257 trans = btrfs_join_transaction(root); 5258 if (IS_ERR(trans)) { 5259 ret = PTR_ERR(trans); 5260 goto out; 5261 } 5262 5263 if (sa->create) { 5264 ret = btrfs_create_qgroup(trans, sa->qgroupid); 5265 } else { 5266 ret = btrfs_remove_qgroup(trans, sa->qgroupid); 5267 } 5268 5269 err = btrfs_end_transaction(trans); 5270 if (err && !ret) 5271 ret = err; 5272 5273 out: 5274 kfree(sa); 5275 drop_write: 5276 mnt_drop_write_file(file); 5277 return ret; 5278 } 5279 5280 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) 5281 { 5282 struct inode *inode = file_inode(file); 5283 struct btrfs_root *root = BTRFS_I(inode)->root; 5284 struct btrfs_ioctl_qgroup_limit_args *sa; 5285 struct btrfs_trans_handle *trans; 5286 int ret; 5287 int err; 5288 u64 qgroupid; 5289 5290 if (!capable(CAP_SYS_ADMIN)) 5291 return -EPERM; 5292 5293 ret = mnt_want_write_file(file); 5294 if (ret) 5295 return ret; 5296 5297 sa = memdup_user(arg, sizeof(*sa)); 5298 if (IS_ERR(sa)) { 5299 ret = PTR_ERR(sa); 5300 goto drop_write; 5301 } 5302 5303 trans = btrfs_join_transaction(root); 5304 if (IS_ERR(trans)) { 5305 ret = PTR_ERR(trans); 5306 goto out; 5307 } 5308 5309 qgroupid = sa->qgroupid; 5310 if (!qgroupid) { 5311 /* take the current subvol as qgroup */ 5312 qgroupid = root->root_key.objectid; 5313 } 5314 5315 ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); 5316 5317 
err = btrfs_end_transaction(trans); 5318 if (err && !ret) 5319 ret = err; 5320 5321 out: 5322 kfree(sa); 5323 drop_write: 5324 mnt_drop_write_file(file); 5325 return ret; 5326 } 5327 5328 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 5329 { 5330 struct inode *inode = file_inode(file); 5331 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5332 struct btrfs_ioctl_quota_rescan_args *qsa; 5333 int ret; 5334 5335 if (!capable(CAP_SYS_ADMIN)) 5336 return -EPERM; 5337 5338 ret = mnt_want_write_file(file); 5339 if (ret) 5340 return ret; 5341 5342 qsa = memdup_user(arg, sizeof(*qsa)); 5343 if (IS_ERR(qsa)) { 5344 ret = PTR_ERR(qsa); 5345 goto drop_write; 5346 } 5347 5348 if (qsa->flags) { 5349 ret = -EINVAL; 5350 goto out; 5351 } 5352 5353 ret = btrfs_qgroup_rescan(fs_info); 5354 5355 out: 5356 kfree(qsa); 5357 drop_write: 5358 mnt_drop_write_file(file); 5359 return ret; 5360 } 5361 5362 static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 5363 { 5364 struct inode *inode = file_inode(file); 5365 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5366 struct btrfs_ioctl_quota_rescan_args *qsa; 5367 int ret = 0; 5368 5369 if (!capable(CAP_SYS_ADMIN)) 5370 return -EPERM; 5371 5372 qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); 5373 if (!qsa) 5374 return -ENOMEM; 5375 5376 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 5377 qsa->flags = 1; 5378 qsa->progress = fs_info->qgroup_rescan_progress.objectid; 5379 } 5380 5381 if (copy_to_user(arg, qsa, sizeof(*qsa))) 5382 ret = -EFAULT; 5383 5384 kfree(qsa); 5385 return ret; 5386 } 5387 5388 static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) 5389 { 5390 struct inode *inode = file_inode(file); 5391 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5392 5393 if (!capable(CAP_SYS_ADMIN)) 5394 return -EPERM; 5395 5396 return btrfs_qgroup_wait_for_completion(fs_info, true); 5397 } 5398 5399 static long 
_btrfs_ioctl_set_received_subvol(struct file *file, 5400 struct btrfs_ioctl_received_subvol_args *sa) 5401 { 5402 struct inode *inode = file_inode(file); 5403 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5404 struct btrfs_root *root = BTRFS_I(inode)->root; 5405 struct btrfs_root_item *root_item = &root->root_item; 5406 struct btrfs_trans_handle *trans; 5407 struct timespec64 ct = current_time(inode); 5408 int ret = 0; 5409 int received_uuid_changed; 5410 5411 if (!inode_owner_or_capable(inode)) 5412 return -EPERM; 5413 5414 ret = mnt_want_write_file(file); 5415 if (ret < 0) 5416 return ret; 5417 5418 down_write(&fs_info->subvol_sem); 5419 5420 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 5421 ret = -EINVAL; 5422 goto out; 5423 } 5424 5425 if (btrfs_root_readonly(root)) { 5426 ret = -EROFS; 5427 goto out; 5428 } 5429 5430 /* 5431 * 1 - root item 5432 * 2 - uuid items (received uuid + subvol uuid) 5433 */ 5434 trans = btrfs_start_transaction(root, 3); 5435 if (IS_ERR(trans)) { 5436 ret = PTR_ERR(trans); 5437 trans = NULL; 5438 goto out; 5439 } 5440 5441 sa->rtransid = trans->transid; 5442 sa->rtime.sec = ct.tv_sec; 5443 sa->rtime.nsec = ct.tv_nsec; 5444 5445 received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, 5446 BTRFS_UUID_SIZE); 5447 if (received_uuid_changed && 5448 !btrfs_is_empty_uuid(root_item->received_uuid)) { 5449 ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, 5450 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 5451 root->root_key.objectid); 5452 if (ret && ret != -ENOENT) { 5453 btrfs_abort_transaction(trans, ret); 5454 btrfs_end_transaction(trans); 5455 goto out; 5456 } 5457 } 5458 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); 5459 btrfs_set_root_stransid(root_item, sa->stransid); 5460 btrfs_set_root_rtransid(root_item, sa->rtransid); 5461 btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); 5462 btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); 5463 
btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); 5464 btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); 5465 5466 ret = btrfs_update_root(trans, fs_info->tree_root, 5467 &root->root_key, &root->root_item); 5468 if (ret < 0) { 5469 btrfs_end_transaction(trans); 5470 goto out; 5471 } 5472 if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { 5473 ret = btrfs_uuid_tree_add(trans, sa->uuid, 5474 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 5475 root->root_key.objectid); 5476 if (ret < 0 && ret != -EEXIST) { 5477 btrfs_abort_transaction(trans, ret); 5478 btrfs_end_transaction(trans); 5479 goto out; 5480 } 5481 } 5482 ret = btrfs_commit_transaction(trans); 5483 out: 5484 up_write(&fs_info->subvol_sem); 5485 mnt_drop_write_file(file); 5486 return ret; 5487 } 5488 5489 #ifdef CONFIG_64BIT 5490 static long btrfs_ioctl_set_received_subvol_32(struct file *file, 5491 void __user *arg) 5492 { 5493 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; 5494 struct btrfs_ioctl_received_subvol_args *args64 = NULL; 5495 int ret = 0; 5496 5497 args32 = memdup_user(arg, sizeof(*args32)); 5498 if (IS_ERR(args32)) 5499 return PTR_ERR(args32); 5500 5501 args64 = kmalloc(sizeof(*args64), GFP_KERNEL); 5502 if (!args64) { 5503 ret = -ENOMEM; 5504 goto out; 5505 } 5506 5507 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); 5508 args64->stransid = args32->stransid; 5509 args64->rtransid = args32->rtransid; 5510 args64->stime.sec = args32->stime.sec; 5511 args64->stime.nsec = args32->stime.nsec; 5512 args64->rtime.sec = args32->rtime.sec; 5513 args64->rtime.nsec = args32->rtime.nsec; 5514 args64->flags = args32->flags; 5515 5516 ret = _btrfs_ioctl_set_received_subvol(file, args64); 5517 if (ret) 5518 goto out; 5519 5520 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); 5521 args32->stransid = args64->stransid; 5522 args32->rtransid = args64->rtransid; 5523 args32->stime.sec = args64->stime.sec; 5524 args32->stime.nsec = args64->stime.nsec; 5525 
args32->rtime.sec = args64->rtime.sec; 5526 args32->rtime.nsec = args64->rtime.nsec; 5527 args32->flags = args64->flags; 5528 5529 ret = copy_to_user(arg, args32, sizeof(*args32)); 5530 if (ret) 5531 ret = -EFAULT; 5532 5533 out: 5534 kfree(args32); 5535 kfree(args64); 5536 return ret; 5537 } 5538 #endif 5539 5540 static long btrfs_ioctl_set_received_subvol(struct file *file, 5541 void __user *arg) 5542 { 5543 struct btrfs_ioctl_received_subvol_args *sa = NULL; 5544 int ret = 0; 5545 5546 sa = memdup_user(arg, sizeof(*sa)); 5547 if (IS_ERR(sa)) 5548 return PTR_ERR(sa); 5549 5550 ret = _btrfs_ioctl_set_received_subvol(file, sa); 5551 5552 if (ret) 5553 goto out; 5554 5555 ret = copy_to_user(arg, sa, sizeof(*sa)); 5556 if (ret) 5557 ret = -EFAULT; 5558 5559 out: 5560 kfree(sa); 5561 return ret; 5562 } 5563 5564 static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 5565 { 5566 struct inode *inode = file_inode(file); 5567 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5568 size_t len; 5569 int ret; 5570 char label[BTRFS_LABEL_SIZE]; 5571 5572 spin_lock(&fs_info->super_lock); 5573 memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); 5574 spin_unlock(&fs_info->super_lock); 5575 5576 len = strnlen(label, BTRFS_LABEL_SIZE); 5577 5578 if (len == BTRFS_LABEL_SIZE) { 5579 btrfs_warn(fs_info, 5580 "label is too long, return the first %zu bytes", 5581 --len); 5582 } 5583 5584 ret = copy_to_user(arg, label, len); 5585 5586 return ret ? 
-EFAULT : 0; 5587 } 5588 5589 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 5590 { 5591 struct inode *inode = file_inode(file); 5592 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5593 struct btrfs_root *root = BTRFS_I(inode)->root; 5594 struct btrfs_super_block *super_block = fs_info->super_copy; 5595 struct btrfs_trans_handle *trans; 5596 char label[BTRFS_LABEL_SIZE]; 5597 int ret; 5598 5599 if (!capable(CAP_SYS_ADMIN)) 5600 return -EPERM; 5601 5602 if (copy_from_user(label, arg, sizeof(label))) 5603 return -EFAULT; 5604 5605 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { 5606 btrfs_err(fs_info, 5607 "unable to set label with more than %d bytes", 5608 BTRFS_LABEL_SIZE - 1); 5609 return -EINVAL; 5610 } 5611 5612 ret = mnt_want_write_file(file); 5613 if (ret) 5614 return ret; 5615 5616 trans = btrfs_start_transaction(root, 0); 5617 if (IS_ERR(trans)) { 5618 ret = PTR_ERR(trans); 5619 goto out_unlock; 5620 } 5621 5622 spin_lock(&fs_info->super_lock); 5623 strcpy(super_block->label, label); 5624 spin_unlock(&fs_info->super_lock); 5625 ret = btrfs_commit_transaction(trans); 5626 5627 out_unlock: 5628 mnt_drop_write_file(file); 5629 return ret; 5630 } 5631 5632 #define INIT_FEATURE_FLAGS(suffix) \ 5633 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ 5634 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ 5635 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } 5636 5637 int btrfs_ioctl_get_supported_features(void __user *arg) 5638 { 5639 static const struct btrfs_ioctl_feature_flags features[3] = { 5640 INIT_FEATURE_FLAGS(SUPP), 5641 INIT_FEATURE_FLAGS(SAFE_SET), 5642 INIT_FEATURE_FLAGS(SAFE_CLEAR) 5643 }; 5644 5645 if (copy_to_user(arg, &features, sizeof(features))) 5646 return -EFAULT; 5647 5648 return 0; 5649 } 5650 5651 static int btrfs_ioctl_get_features(struct file *file, void __user *arg) 5652 { 5653 struct inode *inode = file_inode(file); 5654 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5655 
struct btrfs_super_block *super_block = fs_info->super_copy; 5656 struct btrfs_ioctl_feature_flags features; 5657 5658 features.compat_flags = btrfs_super_compat_flags(super_block); 5659 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); 5660 features.incompat_flags = btrfs_super_incompat_flags(super_block); 5661 5662 if (copy_to_user(arg, &features, sizeof(features))) 5663 return -EFAULT; 5664 5665 return 0; 5666 } 5667 5668 static int check_feature_bits(struct btrfs_fs_info *fs_info, 5669 enum btrfs_feature_set set, 5670 u64 change_mask, u64 flags, u64 supported_flags, 5671 u64 safe_set, u64 safe_clear) 5672 { 5673 const char *type = btrfs_feature_set_names[set]; 5674 char *names; 5675 u64 disallowed, unsupported; 5676 u64 set_mask = flags & change_mask; 5677 u64 clear_mask = ~flags & change_mask; 5678 5679 unsupported = set_mask & ~supported_flags; 5680 if (unsupported) { 5681 names = btrfs_printable_features(set, unsupported); 5682 if (names) { 5683 btrfs_warn(fs_info, 5684 "this kernel does not support the %s feature bit%s", 5685 names, strchr(names, ',') ? "s" : ""); 5686 kfree(names); 5687 } else 5688 btrfs_warn(fs_info, 5689 "this kernel does not support %s bits 0x%llx", 5690 type, unsupported); 5691 return -EOPNOTSUPP; 5692 } 5693 5694 disallowed = set_mask & ~safe_set; 5695 if (disallowed) { 5696 names = btrfs_printable_features(set, disallowed); 5697 if (names) { 5698 btrfs_warn(fs_info, 5699 "can't set the %s feature bit%s while mounted", 5700 names, strchr(names, ',') ? "s" : ""); 5701 kfree(names); 5702 } else 5703 btrfs_warn(fs_info, 5704 "can't set %s bits 0x%llx while mounted", 5705 type, disallowed); 5706 return -EPERM; 5707 } 5708 5709 disallowed = clear_mask & ~safe_clear; 5710 if (disallowed) { 5711 names = btrfs_printable_features(set, disallowed); 5712 if (names) { 5713 btrfs_warn(fs_info, 5714 "can't clear the %s feature bit%s while mounted", 5715 names, strchr(names, ',') ? 
"s" : ""); 5716 kfree(names); 5717 } else 5718 btrfs_warn(fs_info, 5719 "can't clear %s bits 0x%llx while mounted", 5720 type, disallowed); 5721 return -EPERM; 5722 } 5723 5724 return 0; 5725 } 5726 5727 #define check_feature(fs_info, change_mask, flags, mask_base) \ 5728 check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ 5729 BTRFS_FEATURE_ ## mask_base ## _SUPP, \ 5730 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ 5731 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) 5732 5733 static int btrfs_ioctl_set_features(struct file *file, void __user *arg) 5734 { 5735 struct inode *inode = file_inode(file); 5736 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5737 struct btrfs_root *root = BTRFS_I(inode)->root; 5738 struct btrfs_super_block *super_block = fs_info->super_copy; 5739 struct btrfs_ioctl_feature_flags flags[2]; 5740 struct btrfs_trans_handle *trans; 5741 u64 newflags; 5742 int ret; 5743 5744 if (!capable(CAP_SYS_ADMIN)) 5745 return -EPERM; 5746 5747 if (copy_from_user(flags, arg, sizeof(flags))) 5748 return -EFAULT; 5749 5750 /* Nothing to do */ 5751 if (!flags[0].compat_flags && !flags[0].compat_ro_flags && 5752 !flags[0].incompat_flags) 5753 return 0; 5754 5755 ret = check_feature(fs_info, flags[0].compat_flags, 5756 flags[1].compat_flags, COMPAT); 5757 if (ret) 5758 return ret; 5759 5760 ret = check_feature(fs_info, flags[0].compat_ro_flags, 5761 flags[1].compat_ro_flags, COMPAT_RO); 5762 if (ret) 5763 return ret; 5764 5765 ret = check_feature(fs_info, flags[0].incompat_flags, 5766 flags[1].incompat_flags, INCOMPAT); 5767 if (ret) 5768 return ret; 5769 5770 ret = mnt_want_write_file(file); 5771 if (ret) 5772 return ret; 5773 5774 trans = btrfs_start_transaction(root, 0); 5775 if (IS_ERR(trans)) { 5776 ret = PTR_ERR(trans); 5777 goto out_drop_write; 5778 } 5779 5780 spin_lock(&fs_info->super_lock); 5781 newflags = btrfs_super_compat_flags(super_block); 5782 newflags |= flags[0].compat_flags & flags[1].compat_flags; 5783 newflags &= 
~(flags[0].compat_flags & ~flags[1].compat_flags); 5784 btrfs_set_super_compat_flags(super_block, newflags); 5785 5786 newflags = btrfs_super_compat_ro_flags(super_block); 5787 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; 5788 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); 5789 btrfs_set_super_compat_ro_flags(super_block, newflags); 5790 5791 newflags = btrfs_super_incompat_flags(super_block); 5792 newflags |= flags[0].incompat_flags & flags[1].incompat_flags; 5793 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); 5794 btrfs_set_super_incompat_flags(super_block, newflags); 5795 spin_unlock(&fs_info->super_lock); 5796 5797 ret = btrfs_commit_transaction(trans); 5798 out_drop_write: 5799 mnt_drop_write_file(file); 5800 5801 return ret; 5802 } 5803 5804 static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) 5805 { 5806 struct btrfs_ioctl_send_args *arg; 5807 int ret; 5808 5809 if (compat) { 5810 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5811 struct btrfs_ioctl_send_args_32 args32; 5812 5813 ret = copy_from_user(&args32, argp, sizeof(args32)); 5814 if (ret) 5815 return -EFAULT; 5816 arg = kzalloc(sizeof(*arg), GFP_KERNEL); 5817 if (!arg) 5818 return -ENOMEM; 5819 arg->send_fd = args32.send_fd; 5820 arg->clone_sources_count = args32.clone_sources_count; 5821 arg->clone_sources = compat_ptr(args32.clone_sources); 5822 arg->parent_root = args32.parent_root; 5823 arg->flags = args32.flags; 5824 memcpy(arg->reserved, args32.reserved, 5825 sizeof(args32.reserved)); 5826 #else 5827 return -ENOTTY; 5828 #endif 5829 } else { 5830 arg = memdup_user(argp, sizeof(*arg)); 5831 if (IS_ERR(arg)) 5832 return PTR_ERR(arg); 5833 } 5834 ret = btrfs_ioctl_send(file, arg); 5835 kfree(arg); 5836 return ret; 5837 } 5838 5839 long btrfs_ioctl(struct file *file, unsigned int 5840 cmd, unsigned long arg) 5841 { 5842 struct inode *inode = file_inode(file); 5843 struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->i_sb); 5844 struct btrfs_root *root = BTRFS_I(inode)->root; 5845 void __user *argp = (void __user *)arg; 5846 5847 switch (cmd) { 5848 case FS_IOC_GETFLAGS: 5849 return btrfs_ioctl_getflags(file, argp); 5850 case FS_IOC_SETFLAGS: 5851 return btrfs_ioctl_setflags(file, argp); 5852 case FS_IOC_GETVERSION: 5853 return btrfs_ioctl_getversion(file, argp); 5854 case FITRIM: 5855 return btrfs_ioctl_fitrim(file, argp); 5856 case BTRFS_IOC_SNAP_CREATE: 5857 return btrfs_ioctl_snap_create(file, argp, 0); 5858 case BTRFS_IOC_SNAP_CREATE_V2: 5859 return btrfs_ioctl_snap_create_v2(file, argp, 0); 5860 case BTRFS_IOC_SUBVOL_CREATE: 5861 return btrfs_ioctl_snap_create(file, argp, 1); 5862 case BTRFS_IOC_SUBVOL_CREATE_V2: 5863 return btrfs_ioctl_snap_create_v2(file, argp, 1); 5864 case BTRFS_IOC_SNAP_DESTROY: 5865 return btrfs_ioctl_snap_destroy(file, argp); 5866 case BTRFS_IOC_SUBVOL_GETFLAGS: 5867 return btrfs_ioctl_subvol_getflags(file, argp); 5868 case BTRFS_IOC_SUBVOL_SETFLAGS: 5869 return btrfs_ioctl_subvol_setflags(file, argp); 5870 case BTRFS_IOC_DEFAULT_SUBVOL: 5871 return btrfs_ioctl_default_subvol(file, argp); 5872 case BTRFS_IOC_DEFRAG: 5873 return btrfs_ioctl_defrag(file, NULL); 5874 case BTRFS_IOC_DEFRAG_RANGE: 5875 return btrfs_ioctl_defrag(file, argp); 5876 case BTRFS_IOC_RESIZE: 5877 return btrfs_ioctl_resize(file, argp); 5878 case BTRFS_IOC_ADD_DEV: 5879 return btrfs_ioctl_add_dev(fs_info, argp); 5880 case BTRFS_IOC_RM_DEV: 5881 return btrfs_ioctl_rm_dev(file, argp); 5882 case BTRFS_IOC_RM_DEV_V2: 5883 return btrfs_ioctl_rm_dev_v2(file, argp); 5884 case BTRFS_IOC_FS_INFO: 5885 return btrfs_ioctl_fs_info(fs_info, argp); 5886 case BTRFS_IOC_DEV_INFO: 5887 return btrfs_ioctl_dev_info(fs_info, argp); 5888 case BTRFS_IOC_BALANCE: 5889 return btrfs_ioctl_balance(file, NULL); 5890 case BTRFS_IOC_TREE_SEARCH: 5891 return btrfs_ioctl_tree_search(file, argp); 5892 case BTRFS_IOC_TREE_SEARCH_V2: 5893 return btrfs_ioctl_tree_search_v2(file, argp); 5894 case 
BTRFS_IOC_INO_LOOKUP: 5895 return btrfs_ioctl_ino_lookup(file, argp); 5896 case BTRFS_IOC_INO_PATHS: 5897 return btrfs_ioctl_ino_to_path(root, argp); 5898 case BTRFS_IOC_LOGICAL_INO: 5899 return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); 5900 case BTRFS_IOC_LOGICAL_INO_V2: 5901 return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); 5902 case BTRFS_IOC_SPACE_INFO: 5903 return btrfs_ioctl_space_info(fs_info, argp); 5904 case BTRFS_IOC_SYNC: { 5905 int ret; 5906 5907 ret = btrfs_start_delalloc_roots(fs_info, -1); 5908 if (ret) 5909 return ret; 5910 ret = btrfs_sync_fs(inode->i_sb, 1); 5911 /* 5912 * The transaction thread may want to do more work, 5913 * namely it pokes the cleaner kthread that will start 5914 * processing uncleaned subvols. 5915 */ 5916 wake_up_process(fs_info->transaction_kthread); 5917 return ret; 5918 } 5919 case BTRFS_IOC_START_SYNC: 5920 return btrfs_ioctl_start_sync(root, argp); 5921 case BTRFS_IOC_WAIT_SYNC: 5922 return btrfs_ioctl_wait_sync(fs_info, argp); 5923 case BTRFS_IOC_SCRUB: 5924 return btrfs_ioctl_scrub(file, argp); 5925 case BTRFS_IOC_SCRUB_CANCEL: 5926 return btrfs_ioctl_scrub_cancel(fs_info); 5927 case BTRFS_IOC_SCRUB_PROGRESS: 5928 return btrfs_ioctl_scrub_progress(fs_info, argp); 5929 case BTRFS_IOC_BALANCE_V2: 5930 return btrfs_ioctl_balance(file, argp); 5931 case BTRFS_IOC_BALANCE_CTL: 5932 return btrfs_ioctl_balance_ctl(fs_info, arg); 5933 case BTRFS_IOC_BALANCE_PROGRESS: 5934 return btrfs_ioctl_balance_progress(fs_info, argp); 5935 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 5936 return btrfs_ioctl_set_received_subvol(file, argp); 5937 #ifdef CONFIG_64BIT 5938 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: 5939 return btrfs_ioctl_set_received_subvol_32(file, argp); 5940 #endif 5941 case BTRFS_IOC_SEND: 5942 return _btrfs_ioctl_send(file, argp, false); 5943 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5944 case BTRFS_IOC_SEND_32: 5945 return _btrfs_ioctl_send(file, argp, true); 5946 #endif 5947 case BTRFS_IOC_GET_DEV_STATS: 5948 
return btrfs_ioctl_get_dev_stats(fs_info, argp); 5949 case BTRFS_IOC_QUOTA_CTL: 5950 return btrfs_ioctl_quota_ctl(file, argp); 5951 case BTRFS_IOC_QGROUP_ASSIGN: 5952 return btrfs_ioctl_qgroup_assign(file, argp); 5953 case BTRFS_IOC_QGROUP_CREATE: 5954 return btrfs_ioctl_qgroup_create(file, argp); 5955 case BTRFS_IOC_QGROUP_LIMIT: 5956 return btrfs_ioctl_qgroup_limit(file, argp); 5957 case BTRFS_IOC_QUOTA_RESCAN: 5958 return btrfs_ioctl_quota_rescan(file, argp); 5959 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 5960 return btrfs_ioctl_quota_rescan_status(file, argp); 5961 case BTRFS_IOC_QUOTA_RESCAN_WAIT: 5962 return btrfs_ioctl_quota_rescan_wait(file, argp); 5963 case BTRFS_IOC_DEV_REPLACE: 5964 return btrfs_ioctl_dev_replace(fs_info, argp); 5965 case BTRFS_IOC_GET_FSLABEL: 5966 return btrfs_ioctl_get_fslabel(file, argp); 5967 case BTRFS_IOC_SET_FSLABEL: 5968 return btrfs_ioctl_set_fslabel(file, argp); 5969 case BTRFS_IOC_GET_SUPPORTED_FEATURES: 5970 return btrfs_ioctl_get_supported_features(argp); 5971 case BTRFS_IOC_GET_FEATURES: 5972 return btrfs_ioctl_get_features(file, argp); 5973 case BTRFS_IOC_SET_FEATURES: 5974 return btrfs_ioctl_set_features(file, argp); 5975 case FS_IOC_FSGETXATTR: 5976 return btrfs_ioctl_fsgetxattr(file, argp); 5977 case FS_IOC_FSSETXATTR: 5978 return btrfs_ioctl_fssetxattr(file, argp); 5979 case BTRFS_IOC_GET_SUBVOL_INFO: 5980 return btrfs_ioctl_get_subvol_info(file, argp); 5981 case BTRFS_IOC_GET_SUBVOL_ROOTREF: 5982 return btrfs_ioctl_get_subvol_rootref(file, argp); 5983 case BTRFS_IOC_INO_LOOKUP_USER: 5984 return btrfs_ioctl_ino_lookup_user(file, argp); 5985 } 5986 5987 return -ENOTTY; 5988 } 5989 5990 #ifdef CONFIG_COMPAT 5991 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 5992 { 5993 /* 5994 * These all access 32-bit values anyway so no further 5995 * handling is necessary. 
5996 */ 5997 switch (cmd) { 5998 case FS_IOC32_GETFLAGS: 5999 cmd = FS_IOC_GETFLAGS; 6000 break; 6001 case FS_IOC32_SETFLAGS: 6002 cmd = FS_IOC_SETFLAGS; 6003 break; 6004 case FS_IOC32_GETVERSION: 6005 cmd = FS_IOC_GETVERSION; 6006 break; 6007 } 6008 6009 return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); 6010 } 6011 #endif 6012